diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index f84beff20c52..11adc8df86ec 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -183,7 +183,7 @@ runs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Store Allure test stat in the DB (new) if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }} diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index d5c1fcf524de..daaedf6d11d2 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -56,14 +56,14 @@ runs: if: inputs.build_type != 'remote' uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact path: /tmp/neon - name: Download Neon binaries for the previous release if: inputs.build_type != 'remote' uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact path: /tmp/neon-previous prefix: latest @@ -89,7 +89,7 @@ runs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Install Python deps shell: bash -euxo pipefail {0} @@ -114,6 +114,8 @@ runs: export PLATFORM=${PLATFORM:-github-actions-selfhosted} export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} export DEFAULT_PG_VERSION=${PG_VERSION#v} + export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib + export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-} if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 @@ -178,7 +180,15 @@ runs: # Wake up the cluster if we use remote neon instance if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" + QUERIES=("SELECT version()") + if [[ "${PLATFORM}" = "neon"* ]]; then + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") + fi + + for q in "${QUERIES[@]}"; do + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "${q}" + done fi # Run the tests. diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 9eff4836809d..899cae2b8658 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -77,7 +77,7 @@ jobs: - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest @@ -99,7 +99,14 @@ jobs: # Set --sparse-ordering option of pytest-order plugin # to ensure tests are running in order of appears in the file. 
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests - extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py + extra_params: + -m remote_cluster + --sparse-ordering + --timeout 5400 + --ignore test_runner/performance/test_perf_olap.py + --ignore test_runner/performance/test_perf_pgvector_queries.py + --ignore test_runner/performance/test_logical_replication.py + --ignore test_runner/performance/test_physical_replication.py env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -125,6 +132,69 @@ jobs: env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + replication-tests: + env: + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + DEFAULT_PG_VERSION: 14 + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} + PLATFORM: "neon-staging" + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + options: --init + + steps: + - uses: actions/checkout@v4 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Run benchmark + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_logical_replication.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 5400 + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Run benchmark + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_physical_replication.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 5400 + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + generate-matrices: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday) @@ -235,15 +305,10 @@ jobs: - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo 
"${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Create Neon Project if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform) id: create-neon-project @@ -282,16 +347,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: Benchmark init uses: ./.github/actions/run-python-test-set with: @@ -373,29 +428,16 @@ jobs: - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Set up Connection String id: set-up-connstr run: | CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} - - echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - name: Benchmark pgvector hnsw indexing uses: ./.github/actions/run-python-test-set @@ -417,12 +459,12 @@ jobs: test_selection: performance/test_perf_pgvector_queries.py run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} - extra_params: -m remote_cluster --timeout 21600 + extra_params: -m remote_cluster --timeout 21600 env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - + - name: Create Allure report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -473,15 +515,10 @@ jobs: - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Set up Connection String id: set-up-connstr run: | @@ -503,16 +540,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: ClickBench benchmark uses: ./.github/actions/run-python-test-set with: @@ -576,15 +603,10 @@ jobs: - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Get Connstring Secret Name run: | case "${PLATFORM}" in @@ -613,16 +635,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") 
- if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: Run TPC-H benchmark uses: ./.github/actions/run-python-test-set with: @@ -677,15 +689,10 @@ jobs: - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Set up Connection String id: set-up-connstr run: | @@ -707,16 +714,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: Run user examples uses: ./.github/actions/run-python-test-set with: diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 5a94dd8e6f2d..a69686bf2a6e 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -63,14 +63,16 @@ jobs: mkdir -p /tmp/.docker-custom echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v2 + - uses: docker/setup-buildx-action@v3 + with: + cache-binary: false - - uses: docker/login-action@v2 + - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/build-push-action@v4 + - uses: docker/build-push-action@v6 with: context: . 
provenance: false @@ -82,6 +84,7 @@ jobs: tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }} - name: Remove custom docker config directory + if: always() run: | rm -rf /tmp/.docker-custom diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 87f04996fd81..cb7655e03908 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -30,7 +30,7 @@ jobs: if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} uses: ./.github/workflows/check-permissions.yml with: - github-event-name: ${{ github.event_name}} + github-event-name: ${{ github.event_name }} cancel-previous-e2e-tests: needs: [ check-permissions ] @@ -109,7 +109,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: ./scripts/pysync @@ -149,7 +149,7 @@ jobs: # !~/.cargo/registry/src # ~/.cargo/git/ # target/ -# key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} # Some of our rust modules use FFI and need those to be checked - name: Get postgres headers @@ -291,29 +291,29 @@ jobs: # target/ # # Fall back to older versions of the key, if no cache for current Cargo.lock was found # key: | -# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} -# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- +# v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- - name: Cache postgres v14 build id: cache_pg_14 uses: actions/cache@v4 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v15 build id: cache_pg_15 uses: actions/cache@v4 with: path: pg_install/v15 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v16 build id: cache_pg_16 uses: actions/cache@v4 with: path: pg_install/v16 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' @@ -335,6 +335,8 @@ jobs: - name: Run cargo build run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR ${cov_prefix} mold -run cargo build $CARGO_FLAGS 
$CARGO_FEATURES --bins --tests # Do install *before* running rust tests because they might recompile the @@ -383,6 +385,11 @@ jobs: env: NEXTEST_RETRIES: 3 run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR + LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib + export LD_LIBRARY_PATH + #nextest does not yet support running doctests cargo test --doc $CARGO_FLAGS $CARGO_FEATURES @@ -411,7 +418,7 @@ jobs: - name: Upload Neon artifact uses: ./.github/actions/upload with: - name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact path: /tmp/neon # XXX: keep this after the binaries.list is formed, so the coverage can properly work later @@ -490,7 +497,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: ./scripts/pysync @@ -639,7 +646,7 @@ jobs: - name: Get Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact path: /tmp/neon - name: Get coverage artifact @@ -744,14 +751,16 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v2 + - uses: docker/setup-buildx-action@v3 + with: + cache-binary: false - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/build-push-action@v5 + - uses: docker/build-push-action@v6 with: context: . build-args: | @@ -822,11 +831,12 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v2 + - uses: docker/setup-buildx-action@v3 with: + cache-binary: false # Disable parallelism for docker buildkit. # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner. - config-inline: | + buildkitd-config-inline: | [worker.oci] max-parallelism = 1 @@ -842,7 +852,7 @@ jobs: password: ${{ secrets.AWS_SECRET_KEY_DEV }} - name: Build compute-node image - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: . build-args: | @@ -861,7 +871,7 @@ jobs: - name: Build neon extensions test image if: matrix.version == 'v16' - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: . build-args: | @@ -882,7 +892,7 @@ jobs: - name: Build compute-tools image # compute-tools are Postgres independent, so build it only once if: matrix.version == 'v16' - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: target: compute-tools-image context: . 
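
The test step above runs `cargo test --doc` in addition to `cargo nextest run` because, as the workflow comment notes, nextest does not yet execute doctests. For reference, a doctest is just a fenced example inside a doc comment that `cargo test --doc` compiles and runs; a minimal illustrative sketch (the crate name `my_crate` and the `add` function are made up here, not taken from this repository):

````rust
/// Adds two numbers.
///
/// The example below is a doctest: `cargo test --doc` compiles and runs it,
/// while `cargo nextest run` skips it, which is why the workflow invokes both.
///
/// ```
/// assert_eq!(my_crate::add(2, 2), 4);
/// ```
pub fn add(a: u64, b: u64) -> u64 {
    a + b
}
````
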
@@ -1326,6 +1336,7 @@ jobs: env: BUCKET: neon-github-public-dev PREFIX: artifacts/latest + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} run: | # Update compatibility snapshot for the release for pg_version in v14 v15 v16; do @@ -1339,8 +1350,8 @@ jobs: # Update Neon artifact for the release (reuse already uploaded artifact) for build_type in debug release; do - OLD_PREFIX=artifacts/${GITHUB_RUN_ID} - FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst + OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID} + FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) if [ -z "${S3_KEY}" ]; then @@ -1358,3 +1369,31 @@ jobs: with: from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }} secrets: inherit + + # This job simplifies setting branch protection rules (in GitHub UI) + # by allowing to set only this job instead of listing many others. + # It also makes it easier to rename or parametrise jobs (using matrix) + # which requires changes in branch protection rules + # + # Note, that we can't add external check (like `neon-cloud-e2e`) we still need to use GitHub UI for that. + # + # https://github.com/neondatabase/neon/settings/branch_protection_rules + conclusion: + if: always() + # Format `needs` differently to make the list more readable. + # Usually we do `needs: [...]` + needs: + - check-codestyle-python + - check-codestyle-rust + - regress-tests + - test-images + runs-on: ubuntu-22.04 + steps: + # The list of possible results: + # https://docs.github.com/en/actions/learn-github-actions/contexts#needs-context + - name: Fail the job if any of the dependencies do not succeed + run: exit 1 + if: | + contains(needs.*.result, 'failure') + || contains(needs.*.result, 'cancelled') + || contains(needs.*.result, 'skipped') diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 7d2187e59cd5..11ff634b6c65 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -232,12 +232,19 @@ jobs: - name: Run cargo build run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc) - name: Run cargo test env: NEXTEST_RETRIES: 3 run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR + LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib + export LD_LIBRARY_PATH + cargo nextest run $CARGO_FEATURES -j$(nproc) # Run separate tests for real S3 @@ -378,7 +385,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: cargo build --all --release --timings -j$(nproc) + run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release --timings -j$(nproc) - name: Upload the build stats id: upload-stats diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml new file mode 100644 index 000000000000..ed4e6be71239 --- /dev/null +++ b/.github/workflows/periodic_pagebench.yml @@ -0,0 +1,155 @@ +name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ 
┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '0 18 * * *' # Runs at 6 PM UTC every day + workflow_dispatch: # Allows manual triggering of the workflow + inputs: + commit_hash: + type: string + description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.' + required: false + default: '' + +defaults: + run: + shell: bash -euo pipefail {0} + +concurrency: + group: ${{ github.workflow }} + cancel-in-progress: false + +jobs: + trigger_bench_on_ec2_machine_in_eu_central_1: + runs-on: [ self-hosted, gen3, small ] + container: + image: neondatabase/build-tools:pinned + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + timeout-minutes: 360 # Set the timeout to 6 hours + env: + API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }} + RUN_ID: ${{ github.run_id }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY : ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }} + AWS_DEFAULT_REGION : "eu-central-1" + AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74" + steps: + # we don't need the neon source code because we run everything remotely + # however we still need the local github actions to run the allure step below + - uses: actions/checkout@v4 + + - name: Show my own (github runner) external IP address - usefull for IP allowlisting + run: curl https://ifconfig.me + + - name: Start EC2 instance and wait for the instance to boot up + run: | + aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID + aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID + sleep 60 # sleep some time to allow cloudinit and our API server to start up + + - name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US + run: | + public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text) + echo "Public IP of the EC2 instance: $public_ip" + echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV + + - name: Determine commit hash + env: + INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }} + run: | + if [ -z "$INPUT_COMMIT_HASH" ]; then + echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV + else + echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV + fi + + - name: Start Bench with run_id + run: | + curl -k -X 'POST' \ + "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -H "Authorization: Bearer $API_KEY" \ + -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}" + + - name: Poll Test Status + id: poll_step + run: | + status="" + while [[ "$status" != "failure" && "$status" != "success" ]]; do + response=$(curl -k -X 'GET' \ + "${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer $API_KEY") + echo "Response: $response" + set +x + status=$(echo $response | jq -r '.status') + echo "Test status: $status" + if [[ "$status" == "failure" ]]; then + echo "Test failed" + exit 1 # Fail the job step if status is failure + elif [[ "$status" == "success" || "$status" == "null" ]]; then + break + elif [[ "$status" == "too_many_runs" ]]; then + echo "Too many runs already running" + echo "too_many_runs=true" >> "$GITHUB_OUTPUT" + exit 1 + fi + + sleep 60 # Poll every 60 seconds + done + + - name: Retrieve Test Logs + if: 
always() && steps.poll_step.outputs.too_many_runs != 'true' + run: | + curl -k -X 'GET' \ + "${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \ + -H 'accept: application/gzip' \ + -H "Authorization: Bearer $API_KEY" \ + --output "test_log_${GITHUB_RUN_ID}.gz" + + - name: Unzip Test Log and Print it into this job's log + if: always() && steps.poll_step.outputs.too_many_runs != 'true' + run: | + gzip -d "test_log_${GITHUB_RUN_ID}.gz" + cat "test_log_${GITHUB_RUN_ID}" + + - name: Create Allure report + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + + - name: Cleanup Test Resources + if: always() + run: | + curl -k -X 'POST' \ + "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer $API_KEY" \ + -d '' + + - name: Stop EC2 instance and wait for the instance to be stopped + if: always() && steps.poll_step.outputs.too_many_runs != 'true' + run: | + aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID + aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml new file mode 100644 index 000000000000..e21e45c929c8 --- /dev/null +++ b/.github/workflows/pg-clients.yml @@ -0,0 +1,115 @@ +name: Test Postgres client libraries + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '23 02 * * *' # run once a day, timezone is utc + pull_request: + paths: + - '.github/workflows/pg-clients.yml' + - 'test_runner/pg_clients/**' + - 'poetry.lock' + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref_name }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + +defaults: + run: + shell: bash -euxo pipefail {0} + +env: + DEFAULT_PG_VERSION: 16 + PLATFORM: neon-captest-new + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + AWS_DEFAULT_REGION: eu-central-1 + +jobs: + check-permissions: + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} + uses: ./.github/workflows/check-permissions.yml + with: + github-event-name: ${{ github.event_name }} + + check-build-tools-image: + needs: [ check-permissions ] + uses: ./.github/workflows/check-build-tools-image.yml + + build-build-tools-image: + needs: [ check-build-tools-image ] + uses: ./.github/workflows/build-build-tools-image.yml + with: + image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} + secrets: inherit + + test-postgres-client-libs: + needs: [ build-build-tools-image ] + runs-on: ubuntu-22.04 + + container: + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ 
secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init --user root + + steps: + - uses: actions/checkout@v4 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Create Neon Project + id: create-neon-project + uses: ./.github/actions/neon-project-create + with: + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + postgres_version: ${{ env.DEFAULT_PG_VERSION }} + + - name: Run tests + uses: ./.github/actions/run-python-test-set + with: + build_type: remote + test_selection: pg_clients + run_in_parallel: false + extra_params: -m remote_cluster + pg_version: ${{ env.DEFAULT_PG_VERSION }} + env: + BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} + + - name: Delete Neon Project + if: always() + uses: ./.github/actions/neon-project-delete + with: + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + if: ${{ !cancelled() }} + id: create-allure-report + uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + + - name: Post to a Slack channel + if: github.event.schedule && failure() + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream + slack-message: | + Testing Postgres clients: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>) + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml deleted file mode 100644 index fef3aec754b2..000000000000 --- a/.github/workflows/pg_clients.yml +++ /dev/null @@ -1,98 +0,0 @@ -name: Test Postgres client libraries - -on: - schedule: - # * is a special character in YAML so you have to quote this string - # ┌───────────── minute (0 - 59) - # │ ┌───────────── hour (0 - 23) - # │ │ ┌───────────── day of the month (1 - 31) - # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) - # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '23 02 * * *' # run once a day, timezone is utc - - workflow_dispatch: - -concurrency: - # Allow only one workflow per any non-`main` branch. 
- group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} - cancel-in-progress: true - -jobs: - test-postgres-client-libs: - # TODO: switch to gen2 runner, requires docker - runs-on: ubuntu-22.04 - - env: - DEFAULT_PG_VERSION: 14 - TEST_OUTPUT: /tmp/test_output - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - uses: actions/setup-python@v4 - with: - python-version: 3.9 - - - name: Install Poetry - uses: snok/install-poetry@v1 - - - name: Cache poetry deps - uses: actions/cache@v4 - with: - path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }} - - - name: Install Python deps - shell: bash -euxo pipefail {0} - run: ./scripts/pysync - - - name: Create Neon Project - id: create-neon-project - uses: ./.github/actions/neon-project-create - with: - api_key: ${{ secrets.NEON_STAGING_API_KEY }} - postgres_version: ${{ env.DEFAULT_PG_VERSION }} - - - name: Run pytest - env: - REMOTE_ENV: 1 - BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} - POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - shell: bash -euxo pipefail {0} - run: | - # Test framework expects we have psql binary; - # but since we don't really need it in this test, let's mock it - mkdir -p "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin" && touch "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin/psql"; - ./scripts/pytest \ - --junitxml=$TEST_OUTPUT/junit.xml \ - --tb=short \ - --verbose \ - -m "remote_cluster" \ - -rA "test_runner/pg_clients" - - - name: Delete Neon Project - if: ${{ always() }} - uses: ./.github/actions/neon-project-delete - with: - project_id: ${{ steps.create-neon-project.outputs.project_id }} - api_key: ${{ secrets.NEON_STAGING_API_KEY }} - - # We use GitHub's action upload-artifact because `ubuntu-latest` doesn't have configured AWS CLI. 
- # It will be fixed after switching to gen2 runner - - name: Upload python test logs - if: always() - uses: actions/upload-artifact@v4 - with: - retention-days: 7 - name: python-test-pg_clients-${{ runner.os }}-stage-logs - path: ${{ env.TEST_OUTPUT }} - - - name: Post to a Slack channel - if: ${{ github.event.schedule && failure() }} - uses: slackapi/slack-github-action@v1 - with: - channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" - env: - SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/Cargo.lock b/Cargo.lock index 5393538c5902..776d95c3c745 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1397,9 +1397,9 @@ dependencies = [ [[package]] name = "crc32c" -version = "0.6.5" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2" +checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47" dependencies = [ "rustc_version", ] @@ -1651,6 +1651,16 @@ dependencies = [ "rusticata-macros", ] +[[package]] +name = "deranged" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" +dependencies = [ + "powerfmt", + "serde", +] + [[package]] name = "desim" version = "0.1.0" @@ -3008,9 +3018,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "measured" -version = "0.0.21" +version = "0.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5" +checksum = "3051f3a030d55d680cdef6ca50e80abd1182f8da29f2344a7c9cb575721138f0" dependencies = [ "bytes", "crossbeam-utils", @@ -3026,9 +3036,9 @@ dependencies = [ [[package]] name = "measured-derive" -version = "0.0.21" +version = "0.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d" +checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -3038,9 +3048,9 @@ dependencies = [ [[package]] name = "measured-process" -version = "0.0.21" +version = "0.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000" +checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec" dependencies = [ "libc", "measured", @@ -3275,6 +3285,12 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + [[package]] name = "num-integer" version = "0.1.45" @@ -3667,6 +3683,7 @@ dependencies = [ "sysinfo", "tenant_size_model", "thiserror", + "tikv-jemallocator", "tokio", "tokio-epoll-uring", "tokio-io-timeout", @@ -4077,6 +4094,7 @@ dependencies = [ "tokio-postgres", "tokio-postgres-rustls", "tokio-rustls 0.25.0", + "tokio-util", "tracing", "workspace_hack", ] @@ -4117,6 +4135,12 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] 
name = "ppv-lite86" version = "0.2.17" @@ -5396,9 +5420,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.183" +version = "1.0.203" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c" +checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" dependencies = [ "serde_derive", ] @@ -5415,9 +5439,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.183" +version = "1.0.203" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816" +checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" dependencies = [ "proc-macro2", "quote", @@ -6107,12 +6131,15 @@ dependencies = [ [[package]] name = "time" -version = "0.3.21" +version = "0.3.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc" +checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" dependencies = [ + "deranged", "itoa", "js-sys", + "num-conv", + "powerfmt", "serde", "time-core", "time-macros", @@ -6120,16 +6147,17 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] name = "time-macros" -version = "0.2.9" +version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b" +checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" dependencies = [ + "num-conv", "time-core", ] @@ -6811,6 +6839,7 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", + "toml_edit 0.19.10", "tracing", "tracing-error", "tracing-subscriber", @@ -7426,13 +7455,12 @@ dependencies = [ "clap", "clap_builder", "crossbeam-utils", + "deranged", "either", "fail", "futures-channel", - "futures-core", "futures-executor", "futures-io", - "futures-sink", "futures-util", "getrandom 0.2.11", "hashbrown 0.14.5", @@ -7450,7 +7478,9 @@ dependencies = [ "num-traits", "once_cell", "parquet", + "proc-macro2", "prost", + "quote", "rand 0.8.5", "regex", "regex-automata 0.4.3", @@ -7467,6 +7497,7 @@ dependencies = [ "syn 1.0.109", "syn 2.0.52", "sync_wrapper", + "tikv-jemalloc-sys", "time", "time-macros", "tokio", diff --git a/Cargo.toml b/Cargo.toml index 8fddaaef12dd..fc3dd5180922 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -111,8 +111,8 @@ lasso = "0.7" leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" -measured = { version = "0.0.21", features=["lasso"] } -measured-process = { version = "0.0.21" } +measured = { version = "0.0.22", features=["lasso"] } +measured-process = { version = "0.0.22" } memoffset = "0.8" nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } notify = "6.0.0" diff --git a/Dockerfile b/Dockerfile index b4900d4a94a1..a41598ef72cd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -42,12 +42,13 @@ ARG CACHEPOT_BUCKET=neon-github-dev COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server 
pg_install/v15/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server +COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib COPY --chown=nonroot . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. RUN set -e \ - && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \ + && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ @@ -56,6 +57,7 @@ RUN set -e \ --bin storage_controller \ --bin proxy \ --bin neon_local \ + --bin storage_scrubber \ --locked --release \ && cachepot -s @@ -82,6 +84,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index f85706ef6a44..4826b7914e42 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -1,5 +1,13 @@ FROM debian:bullseye-slim +# Use ARG as a build-time environment variable here to allow. +# It's not supposed to be set outside. +# Alternatively it can be obtained using the following command +# ``` +# . 
/etc/os-release && echo "${VERSION_CODENAME}" +# ``` +ARG DEBIAN_VERSION_CODENAME=bullseye + # Add nonroot user RUN useradd -ms /bin/bash nonroot -b /home SHELL ["/bin/bash", "-c"] @@ -26,7 +34,6 @@ RUN set -e \ liblzma-dev \ libncurses5-dev \ libncursesw5-dev \ - libpq-dev \ libreadline-dev \ libseccomp-dev \ libsqlite3-dev \ @@ -67,12 +74,24 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/ # LLVM ENV LLVM_VERSION=18 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ - && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ + && echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && apt update \ && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \ && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* +# Install docker +RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list \ + && apt update \ + && apt install -y docker-ce docker-ce-cli \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# Configure sudo & docker +RUN usermod -aG sudo nonroot && \ + echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers && \ + usermod -aG docker nonroot + # AWS CLI RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \ && unzip -q awscliv2.zip \ diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index a79b666409ae..eced6fc0b2e7 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -798,7 +798,11 @@ impl ComputeNode { // In this case we need to connect with old `zenith_admin` name // and create new user. We cannot simply rename connected user, // but we can create a new one and grant it all privileges. - let connstr = self.connstr.clone(); + let mut connstr = self.connstr.clone(); + connstr + .query_pairs_mut() + .append_pair("application_name", "apply_config"); + let mut client = match Client::connect(connstr.as_str(), NoTls) { Err(e) => match e.code() { Some(&SqlState::INVALID_PASSWORD) @@ -867,15 +871,19 @@ impl ComputeNode { // Run migrations separately to not hold up cold starts thread::spawn(move || { + let mut connstr = connstr.clone(); + connstr + .query_pairs_mut() + .append_pair("application_name", "migrations"); + let mut client = Client::connect(connstr.as_str(), NoTls)?; handle_migrations(&mut client).context("apply_config handle_migrations") }); Ok(()) } - // We could've wrapped this around `pg_ctl reload`, but right now we don't use - // `pg_ctl` for start / stop, so this just seems much easier to do as we already - // have opened connection to Postgres and superuser access. + // Wrapped this around `pg_ctl reload`, but right now we don't use + // `pg_ctl` for start / stop. 
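
The compute.rs hunk above appends an `application_name` ("apply_config", "migrations") to the connection string before connecting, so the compute node's own sessions are distinguishable in `pg_stat_activity`. A minimal sketch of the same pattern, assuming the connection string is a `url::Url` as the `query_pairs_mut()` calls above imply (the helper name `with_application_name` is illustrative, not from this codebase):

```rust
use url::Url;

fn with_application_name(connstr: &Url, app_name: &str) -> Url {
    // Clone so the original connection string stays untouched, then tag the
    // copy; e.g. "apply_config" or "migrations" then shows up as
    // application_name in pg_stat_activity.
    let mut tagged = connstr.clone();
    tagged
        .query_pairs_mut()
        .append_pair("application_name", app_name);
    tagged
}

fn main() -> Result<(), url::ParseError> {
    let base = Url::parse("postgres://cloud_admin@localhost:5432/postgres")?;
    let tagged = with_application_name(&base, "apply_config");
    // Prints: postgres://cloud_admin@localhost:5432/postgres?application_name=apply_config
    println!("{tagged}");
    Ok(())
}
```

`append_pair` percent-encodes the value and leaves any query parameters already present on the connection string in place.
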
#[instrument(skip_all)] fn pg_reload_conf(&self) -> Result<()> { let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl"); @@ -1387,7 +1395,9 @@ pub fn forward_termination_signal() { let pg_pid = PG_PID.load(Ordering::SeqCst); if pg_pid != 0 { let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32); - // use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html - kill(pg_pid, Signal::SIGQUIT).ok(); + // Use 'fast' shutdown (SIGINT) because it also creates a shutdown checkpoint, which is important for + // ROs to get a list of running xacts faster instead of going through the CLOG. + // See https://www.postgresql.org/docs/current/server-shutdown.html for the list of modes and signals. + kill(pg_pid, Signal::SIGINT).ok(); } } diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 18c228ba5427..543d4462ed1c 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -11,6 +11,7 @@ pub mod logger; pub mod catalog; pub mod compute; pub mod extension_server; +mod migration; pub mod monitor; pub mod params; pub mod pg_helpers; diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs new file mode 100644 index 000000000000..61dcf01c8448 --- /dev/null +++ b/compute_tools/src/migration.rs @@ -0,0 +1,100 @@ +use anyhow::{Context, Result}; +use postgres::Client; +use tracing::info; + +pub(crate) struct MigrationRunner<'m> { + client: &'m mut Client, + migrations: &'m [&'m str], +} + +impl<'m> MigrationRunner<'m> { + pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self { + Self { client, migrations } + } + + fn get_migration_id(&mut self) -> Result { + let query = "SELECT id FROM neon_migration.migration_id"; + let row = self + .client + .query_one(query, &[]) + .context("run_migrations get migration_id")?; + + Ok(row.get::<&str, i64>("id")) + } + + fn update_migration_id(&mut self) -> Result<()> { + let setval = format!( + "UPDATE neon_migration.migration_id SET id={}", + self.migrations.len() + ); + + self.client + .simple_query(&setval) + .context("run_migrations update id")?; + + Ok(()) + } + + fn prepare_migrations(&mut self) -> Result<()> { + let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; + self.client.simple_query(query)?; + + let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; + self.client.simple_query(query)?; + + let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; + self.client.simple_query(query)?; + + let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; + self.client.simple_query(query)?; + + let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; + self.client.simple_query(query)?; + + Ok(()) + } + + pub fn run_migrations(mut self) -> Result<()> { + self.prepare_migrations()?; + + let mut current_migration: usize = self.get_migration_id()? 
as usize; + let starting_migration_id = current_migration; + + let query = "BEGIN"; + self.client + .simple_query(query) + .context("run_migrations begin")?; + + while current_migration < self.migrations.len() { + let migration = self.migrations[current_migration]; + + if migration.starts_with("-- SKIP") { + info!("Skipping migration id={}", current_migration); + } else { + info!( + "Running migration id={}:\n{}\n", + current_migration, migration + ); + self.client.simple_query(migration).with_context(|| { + format!("run_migration current_migration={}", current_migration) + })?; + } + + current_migration += 1; + } + + self.update_migration_id()?; + + let query = "COMMIT"; + self.client + .simple_query(query) + .context("run_migrations commit")?; + + info!( + "Ran {} migrations", + (self.migrations.len() - starting_migration_id) + ); + + Ok(()) + } +} diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 872a3f775070..d7127aac32d4 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -17,7 +17,11 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500); // should be handled gracefully. fn watch_compute_activity(compute: &ComputeNode) { // Suppose that `connstr` doesn't change - let connstr = compute.connstr.as_str(); + let mut connstr = compute.connstr.clone(); + connstr + .query_pairs_mut() + .append_pair("application_name", "compute_activity_monitor"); + let connstr = connstr.as_str(); // During startup and configuration we connect to every Postgres database, // but we don't want to count this as some user activity. So wait until diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index fa0822748b61..863fa9468ff4 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -489,7 +489,7 @@ pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()> /// Read Postgres logs from `stderr` until EOF. 
Buffer is flushed on one of the following conditions: /// - next line starts with timestamp /// - EOF -/// - no new lines were written for the last second +/// - no new lines were written for the last 100 milliseconds async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> { let mut lines = tokio::io::BufReader::new(stderr).lines(); let timeout_duration = Duration::from_millis(100); diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 143f6c1e5f6f..37090b08fd37 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -10,6 +10,7 @@ use tracing::{error, info, info_span, instrument, span_enabled, warn, Level}; use crate::config; use crate::logger::inlinify; +use crate::migration::MigrationRunner; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; @@ -791,69 +792,7 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"), ]; - let mut func = || { - let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; - client.simple_query(query)?; - - let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; - client.simple_query(query)?; - - let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; - client.simple_query(query)?; - - let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; - client.simple_query(query)?; - - let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; - client.simple_query(query)?; - Ok::<_, anyhow::Error>(()) - }; - func().context("handle_migrations prepare")?; - - let query = "SELECT id FROM neon_migration.migration_id"; - let row = client - .query_one(query, &[]) - .context("handle_migrations get migration_id")?; - let mut current_migration: usize = row.get::<&str, i64>("id") as usize; - let starting_migration_id = current_migration; - - let query = "BEGIN"; - client - .simple_query(query) - .context("handle_migrations begin")?; - - while current_migration < migrations.len() { - let migration = &migrations[current_migration]; - if migration.starts_with("-- SKIP") { - info!("Skipping migration id={}", current_migration); - } else { - info!( - "Running migration id={}:\n{}\n", - current_migration, migration - ); - client.simple_query(migration).with_context(|| { - format!("handle_migrations current_migration={}", current_migration) - })?; - } - current_migration += 1; - } - let setval = format!( - "UPDATE neon_migration.migration_id SET id={}", - migrations.len() - ); - client - .simple_query(&setval) - .context("handle_migrations update id")?; - - let query = "COMMIT"; - client - .simple_query(query) - .context("handle_migrations commit")?; - - info!( - "Ran {} migrations", - (migrations.len() - starting_migration_id) - ); + MigrationRunner::new(client, &migrations).run_migrations()?; Ok(()) } diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index f381337346ff..4bf1b29785e8 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -21,10 +21,8 @@ use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, }; -use pageserver_api::controller_api::PlacementPolicy; -use pageserver_api::models::{ - ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo, -}; +use pageserver_api::controller_api::{PlacementPolicy, 
TenantCreateRequest}; +use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInfo}; use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; use postgres_backend::AuthType; use postgres_connection::parse_host_port; @@ -848,20 +846,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re let allow_multiple = sub_args.get_flag("allow-multiple"); - // If --safekeepers argument is given, use only the listed safekeeper nodes. - let safekeepers = - if let Some(safekeepers_str) = sub_args.get_one::("safekeepers") { - let mut safekeepers: Vec = Vec::new(); - for sk_id in safekeepers_str.split(',').map(str::trim) { - let sk_id = NodeId(u64::from_str(sk_id).map_err(|_| { - anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list") - })?); - safekeepers.push(sk_id); - } - safekeepers - } else { - env.safekeepers.iter().map(|sk| sk.id).collect() - }; + // If --safekeepers argument is given, use only the listed + // safekeeper nodes; otherwise all from the env. + let safekeepers = if let Some(safekeepers) = parse_safekeepers(sub_args)? { + safekeepers + } else { + env.safekeepers.iter().map(|sk| sk.id).collect() + }; let endpoint = cplane .endpoints @@ -965,7 +956,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re }) .collect::>() }; - endpoint.reconfigure(pageservers, None).await?; + // If --safekeepers argument is given, use only the listed + // safekeeper nodes; otherwise all from the env. + let safekeepers = parse_safekeepers(sub_args)?; + endpoint.reconfigure(pageservers, None, safekeepers).await?; } "stop" => { let endpoint_id = sub_args @@ -987,6 +981,23 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re Ok(()) } +/// Parse --safekeepers as list of safekeeper ids. +fn parse_safekeepers(sub_args: &ArgMatches) -> Result>> { + if let Some(safekeepers_str) = sub_args.get_one::("safekeepers") { + let mut safekeepers: Vec = Vec::new(); + for sk_id in safekeepers_str.split(',').map(str::trim) { + let sk_id = NodeId( + u64::from_str(sk_id) + .map_err(|_| anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list"))?, + ); + safekeepers.push(sk_id); + } + Ok(Some(safekeepers)) + } else { + Ok(None) + } +} + fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { let (sub_name, sub_args) = match sub_match.subcommand() { Some(ep_subcommand_data) => ep_subcommand_data, @@ -1590,7 +1601,7 @@ fn cli() -> Command { .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.") .arg(endpoint_id_arg.clone()) .arg(endpoint_pageserver_id_arg.clone()) - .arg(safekeepers_arg) + .arg(safekeepers_arg.clone()) .arg(remote_ext_config_args) .arg(create_test_user) .arg(allow_multiple.clone()) @@ -1599,6 +1610,7 @@ fn cli() -> Command { .subcommand(Command::new("reconfigure") .about("Reconfigure the endpoint") .arg(endpoint_pageserver_id_arg) + .arg(safekeepers_arg) .arg(endpoint_id_arg.clone()) .arg(tenant_id_arg.clone()) ) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index b928bbfc308e..f9bb2da7e7ac 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -499,6 +499,23 @@ impl Endpoint { .join(",") } + /// Map safekeepers ids to the actual connection strings. 
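
Around this point the CLI side (`parse_safekeepers`, above) turns a `--safekeepers` list into node IDs, and the endpoint side (`build_safekeepers_connstrs`, below) resolves those IDs to listen addresses. A self-contained sketch of that parse-then-resolve flow using plain integers; the function names and the `ports` map are illustrative stand-ins, not the repository's actual types:

```rust
use std::collections::HashMap;

/// Parse a `--safekeepers` style list ("1, 2,3") into node ids,
/// mirroring the validation done by `parse_safekeepers` above.
fn parse_safekeeper_ids(arg: &str) -> Result<Vec<u64>, String> {
    arg.split(',')
        .map(str::trim)
        .filter(|s| !s.is_empty())
        .map(|s| {
            s.parse::<u64>()
                .map_err(|_| format!("invalid node ID \"{s}\" in --safekeepers list"))
        })
        .collect()
}

/// Map node ids to "host:port" strings, analogous to
/// `build_safekeepers_connstrs` below; `ports` stands in for the
/// environment's safekeeper registry.
fn safekeeper_connstrings(ids: &[u64], ports: &HashMap<u64, u16>) -> Result<Vec<String>, String> {
    ids.iter()
        .map(|id| {
            ports
                .get(id)
                .map(|port| format!("127.0.0.1:{port}"))
                .ok_or_else(|| format!("safekeeper {id} does not exist"))
        })
        .collect()
}

fn main() -> Result<(), String> {
    let ports = HashMap::from([(1, 5454), (2, 5455)]);
    let ids = parse_safekeeper_ids("1, 2")?;
    let connstrs = safekeeper_connstrings(&ids, &ports)?;
    assert_eq!(connstrs, vec!["127.0.0.1:5454", "127.0.0.1:5455"]);
    Ok(())
}
```

Collecting into `Result` surfaces the first invalid ID or unknown safekeeper as the error, matching the bail-out behaviour of the real helpers.
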
+ fn build_safekeepers_connstrs(&self, sk_ids: Vec) -> Result> { + let mut safekeeper_connstrings = Vec::new(); + if self.mode == ComputeMode::Primary { + for sk_id in sk_ids { + let sk = self + .env + .safekeepers + .iter() + .find(|node| node.id == sk_id) + .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?; + safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port())); + } + } + Ok(safekeeper_connstrings) + } + pub async fn start( &self, auth_token: &Option, @@ -523,18 +540,7 @@ impl Endpoint { let pageserver_connstring = Self::build_pageserver_connstr(&pageservers); assert!(!pageserver_connstring.is_empty()); - let mut safekeeper_connstrings = Vec::new(); - if self.mode == ComputeMode::Primary { - for sk_id in safekeepers { - let sk = self - .env - .safekeepers - .iter() - .find(|node| node.id == sk_id) - .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?; - safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port())); - } - } + let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?; // check for file remote_extensions_spec.json // if it is present, read it and pass to compute_ctl @@ -740,6 +746,7 @@ impl Endpoint { &self, mut pageservers: Vec<(Host, u16)>, stripe_size: Option, + safekeepers: Option>, ) -> Result<()> { let mut spec: ComputeSpec = { let spec_path = self.endpoint_path().join("spec.json"); @@ -774,6 +781,12 @@ impl Endpoint { spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize); } + // If safekeepers are not specified, don't change them. + if let Some(safekeepers) = safekeepers { + let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?; + spec.safekeeper_connstrings = safekeeper_connstrings; + } + let client = reqwest::Client::builder() .timeout(Duration::from_secs(30)) .build() diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 6634274d2a55..3ac3ce21df8f 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -325,11 +325,16 @@ impl LocalEnv { } } + pub fn pg_dir(&self, pg_version: u32, dir_name: &str) -> anyhow::Result { + Ok(self.pg_distrib_dir(pg_version)?.join(dir_name)) + } + pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result { - Ok(self.pg_distrib_dir(pg_version)?.join("bin")) + self.pg_dir(pg_version, "bin") } + pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result { - Ok(self.pg_distrib_dir(pg_version)?.join("lib")) + self.pg_dir(pg_version, "lib") } pub fn pageserver_bin(&self) -> PathBuf { diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index da4b98784915..f0403b179622 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -15,10 +15,8 @@ use std::time::Duration; use anyhow::{bail, Context}; use camino::Utf8PathBuf; -use futures::SinkExt; use pageserver_api::models::{ - self, AuxFilePolicy, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, - TimelineInfo, + self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo, }; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; @@ -397,28 +395,6 @@ impl PageServerNode { } } - pub async fn tenant_create( - &self, - new_tenant_id: TenantId, - generation: Option, - settings: HashMap<&str, &str>, - ) -> anyhow::Result { - let config = Self::parse_config(settings.clone())?; - - let request = models::TenantCreateRequest { - new_tenant_id: TenantShardId::unsharded(new_tenant_id), - generation, - config, - shard_parameters: 
ShardParameters::default(), - // Placement policy is not meaningful for creations not done via storage controller - placement_policy: None, - }; - if !settings.is_empty() { - bail!("Unrecognized tenant settings: {settings:?}") - } - Ok(self.http_client.tenant_create(&request).await?) - } - pub async fn tenant_config( &self, tenant_id: TenantId, @@ -589,60 +565,39 @@ impl PageServerNode { pg_wal: Option<(Lsn, PathBuf)>, pg_version: u32, ) -> anyhow::Result<()> { - let (client, conn) = self.page_server_psql_client().await?; - // The connection object performs the actual communication with the database, - // so spawn it off to run on its own. - tokio::spawn(async move { - if let Err(e) = conn.await { - eprintln!("connection error: {}", e); - } - }); - let client = std::pin::pin!(client); - // Init base reader let (start_lsn, base_tarfile_path) = base; let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?; - let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile); + let base_tarfile = + mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(base_tarfile)); // Init wal reader if necessary let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal { let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?; - let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile); + let wal_reader = + mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(wal_tarfile)); (end_lsn, Some(wal_reader)) } else { (start_lsn, None) }; - let copy_in = |reader, cmd| { - let client = &client; - async move { - let writer = client.copy_in(&cmd).await?; - let writer = std::pin::pin!(writer); - let mut writer = writer.sink_map_err(|e| { - std::io::Error::new(std::io::ErrorKind::Other, format!("{e}")) - }); - let mut reader = std::pin::pin!(reader); - writer.send_all(&mut reader).await?; - writer.into_inner().finish().await?; - anyhow::Ok(()) - } - }; - // Import base - copy_in( - base_tarfile, - format!( - "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}" - ), - ) - .await?; - // Import wal if necessary - if let Some(wal_reader) = wal_reader { - copy_in( - wal_reader, - format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"), + self.http_client + .import_basebackup( + tenant_id, + timeline_id, + start_lsn, + end_lsn, + pg_version, + base_tarfile, ) .await?; + + // Import wal if necessary + if let Some(wal_reader) = wal_reader { + self.http_client + .import_wal(tenant_id, timeline_id, start_lsn, end_lsn, wal_reader) + .await?; } Ok(()) diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 1c56d5f80fe4..47103a2e0ac5 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -5,12 +5,11 @@ use crate::{ use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::{ controller_api::{ - NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse, - TenantShardMigrateRequest, TenantShardMigrateResponse, + NodeConfigureRequest, NodeRegisterRequest, TenantCreateRequest, TenantCreateResponse, + TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse, }, models::{ - TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse, - TimelineCreateRequest, TimelineInfo, + TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, }, shard::{ShardStripeSize, TenantShardId}, }; @@ -156,16 +155,16 @@ impl StorageController { .expect("non-Unicode 
path") } - /// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl` + /// Find the directory containing postgres subdirectories, such `bin` and `lib` /// /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back /// to other versions if that one isn't found. Some automated tests create circumstances /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`. - pub async fn get_pg_bin_dir(&self) -> anyhow::Result { + async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result { let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14]; for v in prefer_versions { - let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap(); + let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap(); if tokio::fs::try_exists(&path).await? { return Ok(path); } @@ -173,11 +172,20 @@ impl StorageController { // Fall through anyhow::bail!( - "Postgres binaries not found in {}", - self.env.pg_distrib_dir.display() + "Postgres directory '{}' not found in {}", + dir_name, + self.env.pg_distrib_dir.display(), ); } + pub async fn get_pg_bin_dir(&self) -> anyhow::Result { + self.get_pg_dir("bin").await + } + + pub async fn get_pg_lib_dir(&self) -> anyhow::Result { + self.get_pg_dir("lib").await + } + /// Readiness check for our postgres process async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result { let bin_path = pg_bin_dir.join("pg_isready"); @@ -230,12 +238,17 @@ impl StorageController { .unwrap() .join("storage_controller_db"); let pg_bin_dir = self.get_pg_bin_dir().await?; + let pg_lib_dir = self.get_pg_lib_dir().await?; let pg_log_path = pg_data_path.join("postgres.log"); if !tokio::fs::try_exists(&pg_data_path).await? { // Initialize empty database let initdb_path = pg_bin_dir.join("initdb"); let mut child = Command::new(&initdb_path) + .envs(vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ]) .args(["-D", pg_data_path.as_ref()]) .spawn() .expect("Failed to spawn initdb"); @@ -270,7 +283,10 @@ impl StorageController { &self.env.base_data_dir, pg_bin_dir.join("pg_ctl").as_std_path(), db_start_args, - [], + vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ], background_process::InitialPidFile::Create(self.postgres_pid_file()), retry_timeout, || self.pg_isready(&pg_bin_dir), @@ -325,7 +341,10 @@ impl StorageController { &self.env.base_data_dir, &self.env.storage_controller_bin(), args, - [], + vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ], background_process::InitialPidFile::Create(self.pid_file()), retry_timeout, || async { diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 775aedb60001..b2c5dfe58a7f 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -4,13 +4,13 @@ use std::{str::FromStr, time::Duration}; use clap::{Parser, Subcommand}; use pageserver_api::{ controller_api::{ - NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, + NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, - ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest, 
- TenantShardSplitRequest, TenantShardSplitResponse, + ShardParameters, TenantConfig, TenantConfigRequest, TenantShardSplitRequest, + TenantShardSplitResponse, }, shard::{ShardStripeSize, TenantShardId}, }; @@ -336,14 +336,18 @@ async fn main() -> anyhow::Result<()> { .await?; } Command::TenantCreate { tenant_id } => { - vps_client - .tenant_create(&TenantCreateRequest { - new_tenant_id: TenantShardId::unsharded(tenant_id), - generation: None, - shard_parameters: ShardParameters::default(), - placement_policy: Some(PlacementPolicy::Attached(1)), - config: TenantConfig::default(), - }) + storcon_client + .dispatch( + Method::POST, + "v1/tenant".to_string(), + Some(TenantCreateRequest { + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation: None, + shard_parameters: ShardParameters::default(), + placement_policy: Some(PlacementPolicy::Attached(1)), + config: TenantConfig::default(), + }), + ) .await?; } Command::TenantDelete { tenant_id } => { diff --git a/docs/rfcs/033-storage-controller-drain-and-fill.md b/docs/rfcs/033-storage-controller-drain-and-fill.md new file mode 100644 index 000000000000..77c84cd2a525 --- /dev/null +++ b/docs/rfcs/033-storage-controller-drain-and-fill.md @@ -0,0 +1,345 @@ +# Graceful Restarts of Storage Controller Managed Clusters + +## Summary +This RFC describes new storage controller APIs for draining and filling tenant shards from/on pageserver nodes. +It also covers how these new APIs should be used by an orchestrator (e.g. Ansible) in order to implement +graceful cluster restarts. + +## Motivation + +Pageserver restarts cause read availability downtime for tenants. + +For example pageserver-3 @ us-east-1 was unavailable for a randomly +picked tenant (which requested on-demand activation) for around 30 seconds +during the restart at 2024-04-03 16:37 UTC. + +Note that lots of shutdowns on loaded pageservers do not finish within the +[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers +and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse. + +This problem is not yet very acutely felt in storage controller managed pageservers since +tenant density is much lower there. However, we are planning on eventually migrating all +pageservers to storage controller management, so it makes sense to solve the issue proactively. + +## Requirements + +- Pageserver re-deployments cause minimal downtime for tenants +- The storage controller exposes HTTP API hooks for draining and filling tenant shards +from a given pageserver. Said hooks can be used by an orchestrator process or a human operator. +- The storage controller exposes some HTTP API to cancel draining and filling background operations. +- Failures to drain or fill the node should not be fatal. In such cases, cluster restarts should proceed +as usual (with downtime). +- Progress of draining/filling is visible through metrics + +## Non Goals + +- Integration with the control plane +- Graceful restarts for large non-HA tenants. + +## Impacted Components + +- storage controller +- deployment orchestrator (i.e. Ansible) +- pageserver (indirectly) + +## Terminology + +** Draining ** is the process through which all tenant shards that can be migrated from a given pageserver +are distributed across the rest of the cluster. 
+ +** Filling ** is the symmetric opposite of draining. In this process, tenant shards are migrated onto a given +pageserver until the cluster reaches a reasonable, quiescent distribution of tenant shards across pageservers. + +** Node scheduling policies ** act as constraints to the scheduler. For instance, when a +node is set in the `Paused` policy, no further shards will be scheduled on it. + +** Node ** is a pageserver. The terms are used interchangeably in this RFC. + +** Deployment orchestrator ** is a generic term for whatever drives our deployments. +Currently, it's an Ansible playbook. + +## Background + +### Storage Controller Basics (skip if already familiar) + +Fundamentally, the storage controller is a reconciler which aims to move from the observed mapping between pageservers and tenant shards to an intended mapping. Pageserver node and tenant shard metadata is durably persisted in a database, but note that the mapping between the two entities is not durably persisted. Instead, this mapping (*observed state*) is constructed at startup by sending `GET location_config` requests to registered pageservers. + +An internal scheduler maps tenant shards to pageservers while respecting certain constraints. The result of scheduling is the *intent state*. When the intent state changes, a *reconciliation* will inform pageservers about the new assignment via `PUT location_config` requests and will notify the compute via the configured hook. + +### Background Optimizations + +The storage controller performs scheduling optimizations in the background. It will +migrate attachments to warm secondaries and replace secondaries in order to balance +the cluster out. + +### Reconciliations Concurrency Limiting + +There's a hard limit on the number of reconciles that the storage controller +can have in flight at any given time. To get an idea of scale, the limit is +128 at the time of writing. + +## Implementation + +Note: this section focuses on the core functionality of the graceful restart process. +It doesn't necessarily describe the most efficient approach. Optimizations are described +separately in a later section. + +### Overall Flow + +This section describes how to implement graceful restarts from the perspective +of Ansible, the deployment orchestrator. Pageservers are already restarted sequentially. +The orchestrator shall implement the following prologue and epilogue steps for each +pageserver restart: + +#### Prologue + +The orchestrator shall first fetch the pageserver node id from the control plane or +the pageserver it aims to restart directly. Next, it issues an HTTP request +to the storage controller in order to start the drain of said pageserver node. +All error responses are retried with a short back-off. When a 202 (Accepted) +HTTP code is returned, the drain has started. Now the orchestrator polls the +node status endpoint exposed by the storage controller in order to await the +end of the drain process. When the `policy` field of the node status response +becomes `PauseForRestart`, the drain has completed and the orchestrator can +proceed with restarting the pageserver. + +The prologue is subject to an overall timeout. It will have a value in the ballpark +of minutes. As storage controller managed pageservers become more loaded, this timeout +will likely have to increase. + +#### Epilogue + +After restarting the pageserver, the orchestrator issues an HTTP request +to the storage controller to kick off the filling process. 
This API call +may be retried for all error codes with a short backoff. This also serves +as a synchronization primitive as the fill will be refused if the pageserver +has not yet re-attached to the storage controller. When a 202(Accepted) HTTP +code is returned, the fill has started. Now the orchestrator polls the node +status endpoint exposed by the storage controller in order to await the end of +the filling process. When the `policy` field of the node status response becomes +`Active`, the fill has completed and the orchestrator may proceed to the next pageserver. + +Again, the epilogue is subject to an overall timeout. We can start off with +using the same timeout as for the prologue, but can also consider relying on +the storage controller's background optimizations with a shorter timeout. + +In the case that the deployment orchestrator times out, it attempts to cancel +the fill. This operation shall be retried with a short back-off. If it ultimately +fails it will require manual intervention to set the nodes scheduling policy to +`NodeSchedulingPolicy::Active`. Not doing that is not immediately problematic, +but it constrains the scheduler as mentioned previously. + +### Node Scheduling Policy State Machine + +The state machine below encodes the behaviours discussed above and +the various failover situations described in a later section. + +Assuming no failures and/or timeouts the flow should be: +`Active -> Draining -> PauseForRestart -> Active -> Filling -> Active` + +``` + Operator requested drain + +-----------------------------------------+ + | | + +-------+-------+ +-------v-------+ + | | | | + | Pause | +-----------> Draining +----------+ + | | | | | | + +---------------+ | +-------+-------+ | + | | | + | | | + Drain requested| | | + | |Drain complete | Drain failed + | | | Cancelled/PS reattach/Storcon restart + | | | + +-------+-------+ | | + | | | | + +-------------+ Active <-----------+------------------+ + | | | | +Fill requested | +---^---^-------+ | + | | | | + | | | | + | | | | + | Fill completed| | | + | | |PS reattach | + | | |after restart | + +-------v-------+ | | +-------v-------+ + | | | | | | + | Filling +---------+ +-----------+PauseForRestart| + | | | | + +---------------+ +---------------+ +``` + +### Draining/Filling APIs + +The storage controller API to trigger the draining of a given node is: +`PUT /v1/control/node/:node_id/{drain,fill}`. + +The following HTTP non-success return codes are used. +All of them are safely retriable from the perspective of the storage controller. +- 404: Requested node was not found +- 503: Requested node is known to the storage controller, but unavailable +- 412: Drain precondition failed: there is no other node to drain to or the node's schedulling policy forbids draining +- 409: A {drain, fill} is already in progress. Only one such background operation +is allowed per node. + +When the drain is accepted and commenced a 202 HTTP code is returned. + +Drains and fills shall be cancellable by the deployment orchestrator or a +human operator via: `DELETE /v1/control/node/:node_id/{drain,fill}`. A 200 +response is returned when the cancelation is successful. Errors are retriable. 
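To make the orchestrator flow concrete, here is a minimal sketch of the prologue against these endpoints, assuming a Rust orchestrator built on `reqwest` and `tokio`. The drain path, the 202 response, and the `PauseForRestart` policy value come from this RFC; the node status path and the JSON field layout of its response are assumptions made purely for illustration.

```rust
// Sketch of the orchestrator-side prologue. The drain endpoint, the 202
// response, and the `PauseForRestart` policy value come from this RFC;
// the node status path and the JSON field layout are assumptions.
use std::time::Duration;

use anyhow::{bail, Result};
use reqwest::StatusCode;
use serde::Deserialize;

#[derive(Deserialize)]
struct NodeStatus {
    policy: String, // e.g. "Draining", "PauseForRestart", "Active"
}

async fn drain_and_wait(base_url: &str, node_id: u64, timeout: Duration) -> Result<()> {
    let client = reqwest::Client::new();
    let deadline = tokio::time::Instant::now() + timeout;

    // Start the drain; error responses (404/409/412/503) are retried with a
    // short back-off, as described above.
    loop {
        if tokio::time::Instant::now() > deadline {
            bail!("drain did not start in time; restart with downtime");
        }
        let resp = client
            .put(format!("{base_url}/v1/control/node/{node_id}/drain"))
            .send()
            .await?;
        if resp.status() == StatusCode::ACCEPTED {
            break; // 202: drain has started
        }
        tokio::time::sleep(Duration::from_secs(2)).await;
    }

    // Poll the node status until its scheduling policy becomes PauseForRestart.
    loop {
        if tokio::time::Instant::now() > deadline {
            bail!("drain timed out; restart with downtime");
        }
        let status: NodeStatus = client
            .get(format!("{base_url}/v1/control/node/{node_id}")) // assumed status path
            .send()
            .await?
            .json()
            .await?;
        if status.policy == "PauseForRestart" {
            return Ok(()); // safe to restart this pageserver
        }
        tokio::time::sleep(Duration::from_secs(5)).await;
    }
}
```

The epilogue is symmetric: `PUT .../fill` after the pageserver is back, then poll until `policy` returns to `Active`, cancelling via the `DELETE` endpoint if the overall timeout fires.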
+ +### Drain Process + +Before accepting a drain request, the following validations are applied: +* Ensure that the node is known to the storage controller +* Ensure that the scheduling policy is `NodeSchedulingPolicy::Active` or `NodeSchedulingPolicy::Pause` +* Ensure that another drain or fill is not already running on the node +* Ensure that a drain is possible (i.e. check that there is at least one +schedulable node to drain to) + +After accepting the drain, the scheduling policy of the node is set to +`NodeSchedulingPolicy::Draining` and persisted in both memory and the database. +This disallows the optimizer from adding or removing shards from the node, which +is desirable to avoid them racing. + +Next, a separate Tokio task is spawned to manage the draining. For each tenant +shard attached to the node being drained, demote the node to a secondary and +attempt to schedule the node away. Scheduling might fail due to unsatisfiable +constraints, but that is fine. Draining is a best-effort process since it might +not always be possible to cut over all shards. + +Importantly, this task manages the concurrency of issued reconciles in order to +avoid drowning out the target pageservers and to allow other important reconciles +to proceed (see the sketch just before the failure-mode discussion below). + +Once the triggered reconciles have finished or timed out, set the node's scheduling +policy to `NodeSchedulingPolicy::PauseForRestart` to signal the end of the drain. + +A note on non-HA tenants: These tenants do not have secondaries, so by the description +above, they would not be migrated. It makes sense to skip them (especially the large ones) +since, depending on tenant size, this might be more disruptive than the restart since the +pageserver we've moved to will need to on-demand download the entire working set for the tenant. +We can consider expanding to small non-HA tenants in the future. + +### Fill Process + +Before accepting a fill request, the following validations are applied: +* Ensure that the node is known to the storage controller +* Ensure that the scheduling policy is `NodeSchedulingPolicy::Active`. +This is the only acceptable policy for the fill starting state. When a node re-attaches, +it sets the scheduling policy to `NodeSchedulingPolicy::Active` if it was equal to +`NodeSchedulingPolicy::PauseForRestart` or `NodeSchedulingPolicy::Draining` (possible end states for a node drain). +* Ensure that another drain or fill is not already running on the node + +After accepting the fill, the scheduling policy of the node is set to +`NodeSchedulingPolicy::Filling` and persisted in both memory and the database. +This disallows the optimizer from adding or removing shards from the node, which +is desirable to avoid them racing. + +Next, a separate Tokio task is spawned to manage the filling. For each tenant +shard where the filled node is a secondary, promote the secondary. This is done +until we run out of shards or the counts of attached shards become balanced across +the cluster. + +Like for draining, the concurrency of spawned reconciles is limited. + +### Failure Modes & Handling + +Failures are generally handled by transitioning back into the `Active` +(neutral) state. This simplifies the implementation greatly at the +cost of adding transitions to the state machine. For example, we +could detect the `Draining` state upon restart and proceed with a drain, +but how would the storage controller know that this is still what the orchestrator +needs? 
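Before moving on to the individual failure modes, here is a self-contained sketch (in Rust, using `tokio`) of the bounded-concurrency reconcile loop that both the drain and fill tasks described above rely on. All names and types are invented for illustration; the real storage controller wiring is stubbed out, and only the 128 in-flight limit is taken from this RFC.

```rust
// Stand-in for the drain/fill task's bounded-concurrency loop. Storage
// controller internals (scheduler, reconciler, persistence) are stubbed;
// only the shape of the control flow follows the RFC.
use std::sync::Arc;

use tokio::sync::Semaphore;
use tokio::task::JoinSet;

const MAX_INFLIGHT_RECONCILES: usize = 128; // hard limit quoted earlier in this RFC

async fn reconcile_shard(shard: u32) {
    // Stub: demote the drained node to a secondary for this shard, schedule the
    // attachment elsewhere, and wait for the reconciliation to finish.
    println!("reconciled shard {shard}");
}

async fn drain_node(attached_shards: Vec<u32>) {
    // Persisting NodeSchedulingPolicy::Draining would happen here (stubbed out).
    let limit = Arc::new(Semaphore::new(MAX_INFLIGHT_RECONCILES));
    let mut reconciles = JoinSet::new();

    for shard in attached_shards {
        // One permit per in-flight reconcile keeps the drain from drowning out
        // other reconciliations or the target pageservers.
        let permit = limit.clone().acquire_owned().await.unwrap();
        reconciles.spawn(async move {
            reconcile_shard(shard).await;
            drop(permit);
        });
    }
    while reconciles.join_next().await.is_some() {}

    // Persisting NodeSchedulingPolicy::PauseForRestart would happen here (stubbed out).
}

#[tokio::main]
async fn main() {
    drain_node((0..512).collect()).await;
}
```

The fill task would use the same permit-per-reconcile pattern; only the shard selection and the final policy transition differ.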
+ +#### Storage Controller Crash + +When the storage controller starts up, reset the node scheduling policy +of all nodes in states `Draining`, `Filling` or `PauseForRestart` to +`Active`. The rationale is that when the storage controller restarts, +we have lost context of what the deployment orchestrator wants. It also +has the benefit of making things easier to reason about. + +#### Pageserver Crash During Drain + +The pageserver will attempt to re-attach during restart, at which +point the node scheduling policy will be set back to `Active`, thus +reenabling the scheduler to use the node. + +#### Non-drained Pageserver Crash During Drain + +What should happen when a pageserver we are draining to crashes during the +process? Two reasonable options are: cancel the drain and focus on the failover, +*or* do both, but prioritise failover. Since the number of concurrent reconciles +produced by drains/fills is limited, we get the latter behaviour for free. +My suggestion is we take this approach, but the cancellation option is trivial +to implement as well. + +#### Pageserver Crash During Fill + +The pageserver will attempt to re-attach during restart, at which +point the node scheduling policy will be set back to `Active`, thus +reenabling the scheduler to use the node. + +#### Pageserver Goes Unavailable During Drain/Fill + +The drain and fill jobs handle this by stopping early. When the pageserver +is detected as online by storage controller heartbeats, reset its scheduling +policy to `Active`. If a restart happens instead, see the pageserver crash +failure mode. + +#### Orchestrator Drain Times Out + +The orchestrator will still proceed with the restart. +When the pageserver re-attaches, the scheduling policy is set back to +`Active`. + +#### Orchestrator Fill Times Out + +The orchestrator will attempt to cancel the fill operation. If that fails, +the fill will continue until it quiesces and the node will be left +in the `Filling` scheduling policy. This hinders the scheduler, but is +otherwise harmless. A human operator can handle this by setting the scheduling +policy to `Active`, or we can bake a fill timeout into the storage controller. + +## Optimizations + +### Location Warmth + +When cutting over to a secondary, the storage controller will wait for it to +become "warm" (i.e. download enough of the tenant's data). This means that some +reconciliations can take significantly longer than others and hold up precious +reconciliation units. As an optimization, the drain stage can only cut over +tenants that are already "warm". Similarly, the fill stage can prioritise the +"warmest" tenants in the fill. + +Given that the number of tenants managed by the storage controller will be fairly low +for the foreseeable future, the first implementation could simply query the tenants +for secondary status. This doesn't scale well with increasing tenant counts, so +eventually we will need new pageserver API endpoints to report the sets of +"warm" and "cold" nodes. + +## Alternatives Considered + +### Draining and Filling Purely as Scheduling Constraints + +At its core, the storage controller is a big background loop that detects changes +in the environment and reacts to them. One could express draining and filling +of nodes purely in terms of constraining the scheduler (as opposed to having +such background tasks). + +While theoretically nice, I think that's harder to implement and, more importantly, to operate and reason about. +Consider cancellation of a drain/fill operation. 
We would have to update the scheduler state, create +an entirely new schedule (intent state) and start work on applying that. It gets trickier if we wish +to cancel the reconciliation tasks spawned by drain/fill nodes. How would we know which ones belong +to the conceptual drain/fill? One could add labels to reconciliations, but it gets messy in my opinion. + +It would also mean that reconciliations themselves have side effects that persist in the database +(persist something to the databse when the drain is done), which I'm not conceptually fond of. + +## Proof of Concept + +This RFC is accompanied by a POC which implements nearly everything mentioned here +apart from the optimizations and some of the failure handling: +https://github.com/neondatabase/neon/pull/7682 diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs index f53511ab5cc3..723916a7421a 100644 --- a/libs/metrics/src/hll.rs +++ b/libs/metrics/src/hll.rs @@ -13,11 +13,7 @@ use std::{ use measured::{ label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}, - metric::{ - group::{Encoding, MetricValue}, - name::MetricNameEncoder, - Metric, MetricType, MetricVec, - }, + metric::{counter::CounterState, name::MetricNameEncoder, Metric, MetricType, MetricVec}, text::TextEncoder, LabelGroup, }; @@ -144,6 +140,7 @@ impl HyperLogLogState { }) } } + impl measured::metric::MetricEncoding> for HyperLogLogState { @@ -182,12 +179,13 @@ impl measured::metric::MetricEncoding = Lazy::new(|| { .expect("Failed to register maxrss_kb int gauge") }); -pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[ - 0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, -]; +/// Most common fsync latency is 50 µs - 100 µs, but it can be much higher, +/// especially during many concurrent disk operations. 
+pub const DISK_FSYNC_SECONDS_BUCKETS: &[f64] = + &[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 30.0]; pub struct BuildInfo { pub revision: &'static str, @@ -170,8 +171,11 @@ fn write_gauge( labels: impl LabelGroup, name: impl MetricNameEncoder, enc: &mut Enc, -) -> Result<(), Enc::Err> { - enc.write_metric_value(name, labels, MetricValue::Int(x)) +) -> Result<(), Enc::Err> +where + GaugeState: MetricEncoding, +{ + GaugeState::new(x).collect_into(&(), labels, name, enc) } #[derive(Default)] @@ -543,15 +547,6 @@ impl Encoding for Inc { fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { self.0.write_help(name, help) } - - fn write_metric_value( - &mut self, - name: impl MetricNameEncoder, - labels: impl LabelGroup, - value: MetricValue, - ) -> Result<(), Self::Err> { - self.0.write_metric_value(name, labels, value) - } } impl MetricEncoding> for MeasuredCounterPairState @@ -578,15 +573,6 @@ impl Encoding for Dec { fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { self.0.write_help(name, help) } - - fn write_metric_value( - &mut self, - name: impl MetricNameEncoder, - labels: impl LabelGroup, - value: MetricValue, - ) -> Result<(), Self::Err> { - self.0.write_metric_value(name, labels, value) - } } /// Write the dec counter to the encoder diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index a0d10dc665dc..f05c1315eafa 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -11,6 +11,27 @@ use crate::{ shard::{ShardStripeSize, TenantShardId}, }; +#[derive(Serialize, Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct TenantCreateRequest { + pub new_tenant_id: TenantShardId, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub generation: Option, + + // If omitted, create a single shard with TenantShardId::unsharded() + #[serde(default)] + #[serde(skip_serializing_if = "ShardParameters::is_unsharded")] + pub shard_parameters: ShardParameters, + + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub placement_policy: Option, + + #[serde(flatten)] + pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it +} + #[derive(Serialize, Deserialize)] pub struct TenantCreateResponseShard { pub shard_id: TenantShardId, @@ -280,4 +301,19 @@ mod test { assert_eq!(serde_json::from_str::(&encoded)?, v); Ok(()) } + + #[test] + fn test_reject_unknown_field() { + let id = TenantId::generate(); + let create_request = serde_json::json!({ + "new_tenant_id": id.to_string(), + "unknown_field": "unknown_value".to_string(), + }); + let err = serde_json::from_value::(create_request).unwrap_err(); + assert!( + err.to_string().contains("unknown field `unknown_field`"), + "expect unknown field `unknown_field` error, got: {}", + err + ); + } } diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index cd430bfab7d4..0acd83753eff 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -29,7 +29,7 @@ pub const KEY_SIZE: usize = 18; /// See [`Key::to_i128`] for more information on the encoding. pub const METADATA_KEY_SIZE: usize = 16; -/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x40 is a metadata key. +/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x60 is a metadata key. 
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60; pub const METADATA_KEY_END_PREFIX: u8 = 0x7F; diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 9a61f2ad81ae..401887d3629c 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -17,6 +17,16 @@ pub struct KeySpace { pub ranges: Vec>, } +impl std::fmt::Display for KeySpace { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "[")?; + for range in &self.ranges { + write!(f, "{}..{},", range.start, range.end)?; + } + write!(f, "]") + } +} + /// A wrapper type for sparse keyspaces. #[derive(Clone, Debug, Default, PartialEq, Eq)] pub struct SparseKeySpace(pub KeySpace); diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 4875f4949522..d360cc6e870f 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -9,6 +9,7 @@ use std::{ collections::HashMap, io::{BufRead, Read}, num::{NonZeroU64, NonZeroUsize}, + str::FromStr, sync::atomic::AtomicUsize, time::{Duration, SystemTime}, }; @@ -25,7 +26,6 @@ use utils::{ serde_system_time, }; -use crate::controller_api::PlacementPolicy; use crate::{ reltag::RelTag, shard::{ShardCount, ShardStripeSize, TenantShardId}, @@ -229,6 +229,11 @@ pub struct TimelineCreateRequest { pub pg_version: Option, } +#[derive(Serialize, Deserialize, Clone)] +pub struct LsnLeaseRequest { + pub lsn: Lsn, +} + #[derive(Serialize, Deserialize)] pub struct TenantShardSplitRequest { pub new_shard_count: u8, @@ -271,28 +276,6 @@ impl Default for ShardParameters { } } -#[derive(Serialize, Deserialize, Debug)] -#[serde(deny_unknown_fields)] -pub struct TenantCreateRequest { - pub new_tenant_id: TenantShardId, - #[serde(default)] - #[serde(skip_serializing_if = "Option::is_none")] - pub generation: Option, - - // If omitted, create a single shard with TenantShardId::unsharded() - #[serde(default)] - #[serde(skip_serializing_if = "ShardParameters::is_unsharded")] - pub shard_parameters: ShardParameters, - - // This parameter is only meaningful in requests sent to the storage controller - #[serde(default)] - #[serde(skip_serializing_if = "Option::is_none")] - pub placement_policy: Option, - - #[serde(flatten)] - pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it -} - /// An alternative representation of `pageserver::tenant::TenantConf` with /// simpler types. #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)] @@ -455,6 +438,41 @@ pub enum CompactionAlgorithm { Tiered, } +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum ImageCompressionAlgorithm { + // Disabled for writes, support decompressing during read path + Disabled, + /// Zstandard compression. Level 0 means and None mean the same (default level). Levels can be negative as well. + /// For details, see the [manual](http://facebook.github.io/zstd/zstd_manual.html). 
+ Zstd { + level: Option, + }, +} + +impl FromStr for ImageCompressionAlgorithm { + type Err = anyhow::Error; + fn from_str(s: &str) -> Result { + let mut components = s.split(['(', ')']); + let first = components + .next() + .ok_or_else(|| anyhow::anyhow!("empty string"))?; + match first { + "disabled" => Ok(ImageCompressionAlgorithm::Disabled), + "zstd" => { + let level = if let Some(v) = components.next() { + let v: i8 = v.parse()?; + Some(v) + } else { + None + }; + + Ok(ImageCompressionAlgorithm::Zstd { level }) + } + _ => anyhow::bail!("invalid specifier '{first}'"), + } + } +} + #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)] pub struct CompactionAlgorithmSettings { pub kind: CompactionAlgorithm, @@ -547,10 +565,6 @@ pub struct LocationConfigListResponse { pub tenant_shards: Vec<(TenantShardId, Option)>, } -#[derive(Serialize, Deserialize)] -#[serde(transparent)] -pub struct TenantCreateResponse(pub TenantId); - #[derive(Serialize)] pub struct StatusResponse { pub id: NodeId, @@ -670,6 +684,16 @@ pub struct TimelineInfo { pub current_physical_size: Option, // is None when timeline is Unloaded pub current_logical_size_non_incremental: Option, + /// How many bytes of WAL are within this branch's pitr_interval. If the pitr_interval goes + /// beyond the branch's branch point, we only count up to the branch point. + pub pitr_history_size: u64, + + /// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any + /// ancestor data used by this branch would have been retained anyway). If this is false, then + /// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would + /// otherwise be able to GC. + pub within_ancestor_pitr: bool, + pub timeline_dir_layer_file_size_sum: Option, pub wal_source_connstr: Option, @@ -1507,18 +1531,6 @@ mod tests { #[test] fn test_reject_unknown_field() { - let id = TenantId::generate(); - let create_request = json!({ - "new_tenant_id": id.to_string(), - "unknown_field": "unknown_value".to_string(), - }); - let err = serde_json::from_value::(create_request).unwrap_err(); - assert!( - err.to_string().contains("unknown field `unknown_field`"), - "expect unknown field `unknown_field` error, got: {}", - err - ); - let id = TenantId::generate(); let config_request = json!({ "tenant_id": id.to_string(), @@ -1653,4 +1665,25 @@ mod tests { AuxFilePolicy::CrossValidation ); } + + #[test] + fn test_image_compression_algorithm_parsing() { + use ImageCompressionAlgorithm::*; + assert_eq!( + ImageCompressionAlgorithm::from_str("disabled").unwrap(), + Disabled + ); + assert_eq!( + ImageCompressionAlgorithm::from_str("zstd").unwrap(), + Zstd { level: None } + ); + assert_eq!( + ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(), + Zstd { level: Some(18) } + ); + assert_eq!( + ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(), + Zstd { level: Some(-3) } + ); + } } diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 8c5a4e616869..e83cf4c855a1 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -1,59 +1,42 @@ -use std::{ops::RangeInclusive, str::FromStr}; +//! See docs/rfcs/031-sharding-static.md for an overview of sharding. +//! +//! This module contains a variety of types used to represent the concept of sharding +//! a Neon tenant across multiple physical shards. Since there are quite a few of these, +//! we provide an summary here. +//! +//! Types used to describe shards: +//! 
- [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value +//! which identifies a tenant which is not shard-aware. This means its storage paths do not include +//! a shard suffix. +//! - [`ShardNumber`] is simply the zero-based index of a shard within a tenant. +//! - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId` +//! without the tenant ID. This is useful for things that are implicitly scoped to a particular +//! tenant, such as layer files. +//! - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient +//! detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read. +//! - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as +//! four hex digits. An unsharded tenant is `0000`. +//! - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant +//! +//! Types used to describe the parameters for data distribution in a sharded tenant: +//! - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across +//! multiple shards. Its value is given in 8kiB pages. +//! - [`ShardLayout`] describes the data distribution scheme, and at time of writing is +//! always zero: this is provided for future upgrades that might introduce different +//! data distribution schemes. +//! +//! Examples: +//! - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000 +//! - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001 +//! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive), +//! and their slugs are 0004, 0104, 0204, and 0304. use crate::{key::Key, models::ShardParameters}; -use hex::FromHex; use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; -use utils::id::TenantId; -/// See docs/rfcs/031-sharding-static.md for an overview of sharding. -/// -/// This module contains a variety of types used to represent the concept of sharding -/// a Neon tenant across multiple physical shards. Since there are quite a few of these, -/// we provide an summary here. -/// -/// Types used to describe shards: -/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value -/// which identifies a tenant which is not shard-aware. This means its storage paths do not include -/// a shard suffix. -/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant. -/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId` -/// without the tenant ID. This is useful for things that are implicitly scoped to a particular -/// tenant, such as layer files. -/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient -/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read. -/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as -/// four hex digits. An unsharded tenant is `0000`. -/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant -/// -/// Types used to describe the parameters for data distribution in a sharded tenant: -/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across -/// multiple shards. Its value is given in 8kiB pages. 
-/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is -/// always zero: this is provided for future upgrades that might introduce different -/// data distribution schemes. -/// -/// Examples: -/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000 -/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001 -/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive), -/// and their slugs are 0004, 0104, 0204, and 0304. - -#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] -pub struct ShardNumber(pub u8); - -#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] -pub struct ShardCount(u8); - -/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant, -/// when we need to know which shard we're dealing with, but do not need to know the full -/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know -/// the fully qualified TenantShardId. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct ShardIndex { - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} +#[doc(inline)] +pub use ::utils::shard::*; /// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`], /// and to check whether that [`ShardNumber`] is the same as the current shard. @@ -65,362 +48,6 @@ pub struct ShardIdentity { layout: ShardLayout, } -/// Formatting helper, for generating the `shard_id` label in traces. -struct ShardSlug<'a>(&'a TenantShardId); - -/// TenantShardId globally identifies a particular shard in a particular tenant. -/// -/// These are written as `-`, for example: -/// # The second shard in a two-shard tenant -/// 072f1291a5310026820b2fe4b2968934-0102 -/// -/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without -/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables -/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`]. -/// -/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs, -/// is both forward and backward compatible with TenantId: a legacy TenantId can be -/// decoded as a TenantShardId, and when re-encoded it will be parseable -/// as a TenantId. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct TenantShardId { - pub tenant_id: TenantId, - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} - -impl ShardCount { - pub const MAX: Self = Self(u8::MAX); - - /// The internal value of a ShardCount may be zero, which means "1 shard, but use - /// legacy format for TenantShardId that excludes the shard suffix", also known - /// as [`TenantShardId::unsharded`]. - /// - /// This method returns the actual number of shards, i.e. if our internal value is - /// zero, we return 1 (unsharded tenants have 1 shard). - pub fn count(&self) -> u8 { - if self.0 > 0 { - self.0 - } else { - 1 - } - } - - /// The literal internal value: this is **not** the number of shards in the - /// tenant, as we have a special zero value for legacy unsharded tenants. Use - /// [`Self::count`] if you want to know the cardinality of shards. 
- pub fn literal(&self) -> u8 { - self.0 - } - - /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but - /// uses the legacy format for `TenantShardId`. See also the documentation for - /// [`Self::count`]. - pub fn is_unsharded(&self) -> bool { - self.0 == 0 - } - - /// `v` may be zero, or the number of shards in the tenant. `v` is what - /// [`Self::literal`] would return. - pub const fn new(val: u8) -> Self { - Self(val) - } -} - -impl ShardNumber { - pub const MAX: Self = Self(u8::MAX); -} - -impl TenantShardId { - pub fn unsharded(tenant_id: TenantId) -> Self { - Self { - tenant_id, - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - } - } - - /// The range of all TenantShardId that belong to a particular TenantId. This is useful when - /// you have a BTreeMap of TenantShardId, and are querying by TenantId. - pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive { - RangeInclusive::new( - Self { - tenant_id, - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - }, - Self { - tenant_id, - shard_number: ShardNumber::MAX, - shard_count: ShardCount::MAX, - }, - ) - } - - pub fn shard_slug(&self) -> impl std::fmt::Display + '_ { - ShardSlug(self) - } - - /// Convenience for code that has special behavior on the 0th shard. - pub fn is_shard_zero(&self) -> bool { - self.shard_number == ShardNumber(0) - } - - /// The "unsharded" value is distinct from simply having a single shard: it represents - /// a tenant which is not shard-aware at all, and whose storage paths will not include - /// a shard suffix. - pub fn is_unsharded(&self) -> bool { - self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() - } - - /// Convenience for dropping the tenant_id and just getting the ShardIndex: this - /// is useful when logging from code that is already in a span that includes tenant ID, to - /// keep messages reasonably terse. - pub fn to_index(&self) -> ShardIndex { - ShardIndex { - shard_number: self.shard_number, - shard_count: self.shard_count, - } - } - - /// Calculate the children of this TenantShardId when splitting the overall tenant into - /// the given number of shards. - pub fn split(&self, new_shard_count: ShardCount) -> Vec { - let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1); - let mut child_shards = Vec::new(); - for shard_number in 0..ShardNumber(new_shard_count.0).0 { - // Key mapping is based on a round robin mapping of key hash modulo shard count, - // so our child shards are the ones which the same keys would map to. - if shard_number % effective_old_shard_count == self.shard_number.0 { - child_shards.push(TenantShardId { - tenant_id: self.tenant_id, - shard_number: ShardNumber(shard_number), - shard_count: new_shard_count, - }) - } - } - - child_shards - } -} - -impl<'a> std::fmt::Display for ShardSlug<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{:02x}{:02x}", - self.0.shard_number.0, self.0.shard_count.0 - ) - } -} - -impl std::fmt::Display for TenantShardId { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if self.shard_count != ShardCount(0) { - write!(f, "{}-{}", self.tenant_id, self.shard_slug()) - } else { - // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this - // is distinct from the normal single shard case (shard count == 1). 
- self.tenant_id.fmt(f) - } - } -} - -impl std::fmt::Debug for TenantShardId { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // Debug is the same as Display: the compact hex representation - write!(f, "{}", self) - } -} - -impl std::str::FromStr for TenantShardId { - type Err = hex::FromHexError; - - fn from_str(s: &str) -> Result { - // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count - if s.len() == 32 { - // Legacy case: no shard specified - Ok(Self { - tenant_id: TenantId::from_str(s)?, - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - }) - } else if s.len() == 37 { - let bytes = s.as_bytes(); - let tenant_id = TenantId::from_hex(&bytes[0..32])?; - let mut shard_parts: [u8; 2] = [0u8; 2]; - hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?; - Ok(Self { - tenant_id, - shard_number: ShardNumber(shard_parts[0]), - shard_count: ShardCount(shard_parts[1]), - }) - } else { - Err(hex::FromHexError::InvalidStringLength) - } - } -} - -impl From<[u8; 18]> for TenantShardId { - fn from(b: [u8; 18]) -> Self { - let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap(); - - Self { - tenant_id: TenantId::from(tenant_id_bytes), - shard_number: ShardNumber(b[16]), - shard_count: ShardCount(b[17]), - } - } -} - -impl ShardIndex { - pub fn new(number: ShardNumber, count: ShardCount) -> Self { - Self { - shard_number: number, - shard_count: count, - } - } - pub fn unsharded() -> Self { - Self { - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - } - } - - /// The "unsharded" value is distinct from simply having a single shard: it represents - /// a tenant which is not shard-aware at all, and whose storage paths will not include - /// a shard suffix. - pub fn is_unsharded(&self) -> bool { - self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) - } - - /// For use in constructing remote storage paths: concatenate this with a TenantId - /// to get a fully qualified TenantShardId. - /// - /// Backward compat: this function returns an empty string if Self::is_unsharded, such - /// that the legacy pre-sharding remote key format is preserved. 
- pub fn get_suffix(&self) -> String { - if self.is_unsharded() { - "".to_string() - } else { - format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0) - } - } -} - -impl std::fmt::Display for ShardIndex { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0) - } -} - -impl std::fmt::Debug for ShardIndex { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // Debug is the same as Display: the compact hex representation - write!(f, "{}", self) - } -} - -impl std::str::FromStr for ShardIndex { - type Err = hex::FromHexError; - - fn from_str(s: &str) -> Result { - // Expect format: 1 byte shard number, 1 byte shard count - if s.len() == 4 { - let bytes = s.as_bytes(); - let mut shard_parts: [u8; 2] = [0u8; 2]; - hex::decode_to_slice(bytes, &mut shard_parts)?; - Ok(Self { - shard_number: ShardNumber(shard_parts[0]), - shard_count: ShardCount(shard_parts[1]), - }) - } else { - Err(hex::FromHexError::InvalidStringLength) - } - } -} - -impl From<[u8; 2]> for ShardIndex { - fn from(b: [u8; 2]) -> Self { - Self { - shard_number: ShardNumber(b[0]), - shard_count: ShardCount(b[1]), - } - } -} - -impl Serialize for TenantShardId { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - if serializer.is_human_readable() { - serializer.collect_str(self) - } else { - // Note: while human encoding of [`TenantShardId`] is backward and forward - // compatible, this binary encoding is not. - let mut packed: [u8; 18] = [0; 18]; - packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); - packed[16] = self.shard_number.0; - packed[17] = self.shard_count.0; - - packed.serialize(serializer) - } - } -} - -impl<'de> Deserialize<'de> for TenantShardId { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - struct IdVisitor { - is_human_readable_deserializer: bool, - } - - impl<'de> serde::de::Visitor<'de> for IdVisitor { - type Value = TenantShardId; - - fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { - if self.is_human_readable_deserializer { - formatter.write_str("value in form of hex string") - } else { - formatter.write_str("value in form of integer array([u8; 18])") - } - } - - fn visit_seq(self, seq: A) -> Result - where - A: serde::de::SeqAccess<'de>, - { - let s = serde::de::value::SeqAccessDeserializer::new(seq); - let id: [u8; 18] = Deserialize::deserialize(s)?; - Ok(TenantShardId::from(id)) - } - - fn visit_str(self, v: &str) -> Result - where - E: serde::de::Error, - { - TenantShardId::from_str(v).map_err(E::custom) - } - } - - if deserializer.is_human_readable() { - deserializer.deserialize_str(IdVisitor { - is_human_readable_deserializer: true, - }) - } else { - deserializer.deserialize_tuple( - 18, - IdVisitor { - is_human_readable_deserializer: false, - }, - ) - } - } -} - /// Stripe size in number of pages #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] pub struct ShardStripeSize(pub u32); @@ -585,77 +212,6 @@ impl ShardIdentity { } } -impl Serialize for ShardIndex { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - if serializer.is_human_readable() { - serializer.collect_str(self) - } else { - // Binary encoding is not used in index_part.json, but is included in anticipation of - // switching various structures (e.g. inter-process communication, remote metadata) to more - // compact binary encodings in future. 
- let mut packed: [u8; 2] = [0; 2]; - packed[0] = self.shard_number.0; - packed[1] = self.shard_count.0; - packed.serialize(serializer) - } - } -} - -impl<'de> Deserialize<'de> for ShardIndex { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - struct IdVisitor { - is_human_readable_deserializer: bool, - } - - impl<'de> serde::de::Visitor<'de> for IdVisitor { - type Value = ShardIndex; - - fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { - if self.is_human_readable_deserializer { - formatter.write_str("value in form of hex string") - } else { - formatter.write_str("value in form of integer array([u8; 2])") - } - } - - fn visit_seq(self, seq: A) -> Result - where - A: serde::de::SeqAccess<'de>, - { - let s = serde::de::value::SeqAccessDeserializer::new(seq); - let id: [u8; 2] = Deserialize::deserialize(s)?; - Ok(ShardIndex::from(id)) - } - - fn visit_str(self, v: &str) -> Result - where - E: serde::de::Error, - { - ShardIndex::from_str(v).map_err(E::custom) - } - } - - if deserializer.is_human_readable() { - deserializer.deserialize_str(IdVisitor { - is_human_readable_deserializer: true, - }) - } else { - deserializer.deserialize_tuple( - 2, - IdVisitor { - is_human_readable_deserializer: false, - }, - ) - } - } -} - /// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys /// in order to be able to serve basebackup requests without peer communication). fn key_is_shard0(key: &Key) -> bool { @@ -737,7 +293,9 @@ pub fn describe( #[cfg(test)] mod tests { - use utils::Hex; + use std::str::FromStr; + + use utils::{id::TenantId, Hex}; use super::*; diff --git a/libs/postgres_backend/Cargo.toml b/libs/postgres_backend/Cargo.toml index 8e249c09f7e0..c7611b9f213d 100644 --- a/libs/postgres_backend/Cargo.toml +++ b/libs/postgres_backend/Cargo.toml @@ -13,6 +13,7 @@ rustls.workspace = true serde.workspace = true thiserror.workspace = true tokio.workspace = true +tokio-util.workspace = true tokio-rustls.workspace = true tracing.workspace = true @@ -23,4 +24,4 @@ workspace_hack.workspace = true once_cell.workspace = true rustls-pemfile.workspace = true tokio-postgres.workspace = true -tokio-postgres-rustls.workspace = true \ No newline at end of file +tokio-postgres-rustls.workspace = true diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 6c41b7f347a9..7c7c6535b338 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -16,6 +16,7 @@ use std::{fmt, io}; use std::{future::Future, str::FromStr}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_rustls::TlsAcceptor; +use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, trace, warn}; use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter}; @@ -400,21 +401,15 @@ impl PostgresBackend { } /// Wrapper for run_message_loop() that shuts down socket when we are done - pub async fn run( + pub async fn run( mut self, handler: &mut impl Handler, - shutdown_watcher: F, - ) -> Result<(), QueryError> - where - F: Fn() -> S + Clone, - S: Future, - { - let ret = self - .run_message_loop(handler, shutdown_watcher.clone()) - .await; + cancel: &CancellationToken, + ) -> Result<(), QueryError> { + let ret = self.run_message_loop(handler, cancel).await; tokio::select! { - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // do nothing; we most likely got already stopped by shutdown and will log it next. 
} _ = self.framed.shutdown() => { @@ -444,21 +439,17 @@ impl PostgresBackend { } } - async fn run_message_loop( + async fn run_message_loop( &mut self, handler: &mut impl Handler, - shutdown_watcher: F, - ) -> Result<(), QueryError> - where - F: Fn() -> S, - S: Future, - { + cancel: &CancellationToken, + ) -> Result<(), QueryError> { trace!("postgres backend to {:?} started", self.peer_addr); tokio::select!( biased; - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // We were requested to shut down. tracing::info!("shutdown request received during handshake"); return Err(QueryError::Shutdown) @@ -473,7 +464,7 @@ impl PostgresBackend { let mut query_string = Bytes::new(); while let Some(msg) = tokio::select!( biased; - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // We were requested to shut down. tracing::info!("shutdown request received in run_message_loop"); return Err(QueryError::Shutdown) @@ -485,7 +476,7 @@ impl PostgresBackend { let result = self.process_message(handler, msg, &mut query_string).await; tokio::select!( biased; - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // We were requested to shut down. tracing::info!("shutdown request received during response flush"); @@ -672,11 +663,17 @@ impl PostgresBackend { assert!(self.state < ProtoState::Authentication); let have_tls = self.tls_config.is_some(); match msg { - FeStartupPacket::SslRequest => { + FeStartupPacket::SslRequest { direct } => { debug!("SSL requested"); - self.write_message(&BeMessage::EncryptionResponse(have_tls)) - .await?; + if !direct { + self.write_message(&BeMessage::EncryptionResponse(have_tls)) + .await?; + } else if !have_tls { + return Err(QueryError::Other(anyhow::anyhow!( + "direct SSL negotiation but no TLS support" + ))); + } if have_tls { self.start_tls().await?; diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index 80df9db858a9..7ec85f0dbe90 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -3,13 +3,14 @@ use once_cell::sync::Lazy; use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError}; use pq_proto::{BeMessage, RowDescriptor}; use std::io::Cursor; -use std::{future, sync::Arc}; +use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::{TcpListener, TcpStream}; use tokio_postgres::config::SslMode; use tokio_postgres::tls::MakeTlsConnect; use tokio_postgres::{Config, NoTls, SimpleQueryMessage}; use tokio_postgres_rustls::MakeRustlsConnect; +use tokio_util::sync::CancellationToken; // generate client, server test streams async fn make_tcp_pair() -> (TcpStream, TcpStream) { @@ -50,7 +51,7 @@ async fn simple_select() { tokio::spawn(async move { let mut handler = TestHandler {}; - pgbackend.run(&mut handler, future::pending::<()>).await + pgbackend.run(&mut handler, &CancellationToken::new()).await }); let conf = Config::new(); @@ -102,7 +103,7 @@ async fn simple_select_ssl() { tokio::spawn(async move { let mut handler = TestHandler {}; - pgbackend.run(&mut handler, future::pending::<()>).await + pgbackend.run(&mut handler, &CancellationToken::new()).await }); let client_cfg = rustls::ClientConfig::builder() diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 0bbb91afc282..d25b23663bf6 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -356,6 +356,28 @@ impl CheckPoint { } false } + + /// Advance next multi-XID/offset to those given in 
arguments. + /// + /// It's important that this handles wraparound correctly. This should match the + /// MultiXactAdvanceNextMXact() logic in PostgreSQL's xlog_redo() function. + /// + /// Returns 'true' if the Checkpoint was updated. + pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool { + let mut modified = false; + + if multi_xid.wrapping_sub(self.nextMulti) as i32 > 0 { + self.nextMulti = multi_xid; + modified = true; + } + + if multi_offset.wrapping_sub(self.nextMultiOffset) as i32 > 0 { + self.nextMultiOffset = multi_offset; + modified = true; + } + + modified + } } /// Generate new, empty WAL segment, with correct block headers at the first diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs index 496458b2e42d..750affc94eed 100644 --- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs +++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs @@ -202,6 +202,53 @@ pub fn test_update_next_xid() { assert_eq!(checkpoint.nextXid.value, 2048); } +#[test] +pub fn test_update_next_multixid() { + let checkpoint_buf = [0u8; std::mem::size_of::()]; + let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap(); + + // simple case + checkpoint.nextMulti = 20; + checkpoint.nextMultiOffset = 20; + checkpoint.update_next_multixid(1000, 2000); + assert_eq!(checkpoint.nextMulti, 1000); + assert_eq!(checkpoint.nextMultiOffset, 2000); + + // No change + checkpoint.update_next_multixid(500, 900); + assert_eq!(checkpoint.nextMulti, 1000); + assert_eq!(checkpoint.nextMultiOffset, 2000); + + // Close to wraparound, but not wrapped around yet + checkpoint.nextMulti = 0xffff0000; + checkpoint.nextMultiOffset = 0xfffe0000; + checkpoint.update_next_multixid(0xffff00ff, 0xfffe00ff); + assert_eq!(checkpoint.nextMulti, 0xffff00ff); + assert_eq!(checkpoint.nextMultiOffset, 0xfffe00ff); + + // Wraparound + checkpoint.update_next_multixid(1, 900); + assert_eq!(checkpoint.nextMulti, 1); + assert_eq!(checkpoint.nextMultiOffset, 900); + + // Wraparound nextMulti to 0. + // + // It's a bit surprising that nextMulti can be 0, because that's a special value + // (InvalidMultiXactId). However, that's how Postgres does it at multi-xid wraparound: + // nextMulti wraps around to 0, but then when the next multi-xid is assigned, it skips + // the 0 and the next multi-xid actually assigned is 1. + checkpoint.nextMulti = 0xffff0000; + checkpoint.nextMultiOffset = 0xfffe0000; + checkpoint.update_next_multixid(0, 0xfffe00ff); + assert_eq!(checkpoint.nextMulti, 0); + assert_eq!(checkpoint.nextMultiOffset, 0xfffe00ff); + + // Wraparound nextMultiOffset to 0 + checkpoint.update_next_multixid(0, 0); + assert_eq!(checkpoint.nextMulti, 0); + assert_eq!(checkpoint.nextMultiOffset, 0); +} + #[test] pub fn test_encode_logical_message() { let expected = [ diff --git a/libs/pq_proto/src/framed.rs b/libs/pq_proto/src/framed.rs index 6e97b8c2a02c..ccbb90e3842e 100644 --- a/libs/pq_proto/src/framed.rs +++ b/libs/pq_proto/src/framed.rs @@ -44,9 +44,9 @@ impl ConnectionError { /// Wraps async io `stream`, providing messages to write/flush + read Postgres /// messages. 
pub struct Framed { - stream: S, - read_buf: BytesMut, - write_buf: BytesMut, + pub stream: S, + pub read_buf: BytesMut, + pub write_buf: BytesMut, } impl Framed { diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index cee374201763..a01191bd5de3 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -39,14 +39,39 @@ pub enum FeMessage { PasswordMessage(Bytes), } +#[derive(Clone, Copy, PartialEq, PartialOrd)] +pub struct ProtocolVersion(u32); + +impl ProtocolVersion { + pub const fn new(major: u16, minor: u16) -> Self { + Self((major as u32) << 16 | minor as u32) + } + pub const fn minor(self) -> u16 { + self.0 as u16 + } + pub const fn major(self) -> u16 { + (self.0 >> 16) as u16 + } +} + +impl fmt::Debug for ProtocolVersion { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_list() + .entry(&self.major()) + .entry(&self.minor()) + .finish() + } +} + #[derive(Debug)] pub enum FeStartupPacket { CancelRequest(CancelKeyData), - SslRequest, + SslRequest { + direct: bool, + }, GssEncRequest, StartupMessage { - major_version: u32, - minor_version: u32, + version: ProtocolVersion, params: StartupMessageParams, }, } @@ -301,11 +326,23 @@ impl FeStartupPacket { /// different from [`FeMessage::parse`] because startup messages don't have /// message type byte; otherwise, its comments apply. pub fn parse(buf: &mut BytesMut) -> Result, ProtocolError> { + /// const MAX_STARTUP_PACKET_LENGTH: usize = 10000; - const RESERVED_INVALID_MAJOR_VERSION: u32 = 1234; - const CANCEL_REQUEST_CODE: u32 = 5678; - const NEGOTIATE_SSL_CODE: u32 = 5679; - const NEGOTIATE_GSS_CODE: u32 = 5680; + const RESERVED_INVALID_MAJOR_VERSION: u16 = 1234; + /// + const CANCEL_REQUEST_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5678); + /// + const NEGOTIATE_SSL_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5679); + /// + const NEGOTIATE_GSS_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5680); + + // + // First byte indicates standard SSL handshake message + // (It can't be a Postgres startup length because in network byte order + // that would be a startup packet hundreds of megabytes long) + if buf.first() == Some(&0x16) { + return Ok(Some(FeStartupPacket::SslRequest { direct: true })); + } // need at least 4 bytes with packet len if buf.len() < 4 { @@ -338,12 +375,10 @@ impl FeStartupPacket { let mut msg = buf.split_to(len).freeze(); msg.advance(4); // consume len - let request_code = msg.get_u32(); - let req_hi = request_code >> 16; - let req_lo = request_code & ((1 << 16) - 1); + let request_code = ProtocolVersion(msg.get_u32()); // StartupMessage, CancelRequest, SSLRequest etc are differentiated by request code. 
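// Illustrative sketch (the arithmetic follows from ProtocolVersion::new above;
// the named constants are defined a few lines up): packing major/minor into one
// u32 reproduces the classic PostgreSQL wire codes, which is what lets the match
// below compare whole ProtocolVersion values. The 0x16 peek earlier in this
// function works because 0x16 (22) is the TLS "handshake" record type.
//
//     (3u32    << 16) | 0    == 196_608     // ordinary protocol 3.0 startup
//     (1234u32 << 16) | 5678 == 80_877_102  // CANCEL_REQUEST_CODE
//     (1234u32 << 16) | 5679 == 80_877_103  // NEGOTIATE_SSL_CODE
//     (1234u32 << 16) | 5680 == 80_877_104  // NEGOTIATE_GSS_CODE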
- let message = match (req_hi, req_lo) { - (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => { + let message = match request_code { + CANCEL_REQUEST_CODE => { if msg.remaining() != 8 { return Err(ProtocolError::BadMessage( "CancelRequest message is malformed, backend PID / secret key missing" @@ -355,21 +390,22 @@ impl FeStartupPacket { cancel_key: msg.get_i32(), }) } - (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => { + NEGOTIATE_SSL_CODE => { // Requested upgrade to SSL (aka TLS) - FeStartupPacket::SslRequest + FeStartupPacket::SslRequest { direct: false } } - (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => { + NEGOTIATE_GSS_CODE => { // Requested upgrade to GSSAPI FeStartupPacket::GssEncRequest } - (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => { + version if version.major() == RESERVED_INVALID_MAJOR_VERSION => { return Err(ProtocolError::Protocol(format!( - "Unrecognized request code {unrecognized_code}" + "Unrecognized request code {}", + version.minor() ))); } // TODO bail if protocol major_version is not 3? - (major_version, minor_version) => { + version => { // StartupMessage let s = str::from_utf8(&msg).map_err(|_e| { @@ -382,8 +418,7 @@ impl FeStartupPacket { })?; FeStartupPacket::StartupMessage { - major_version, - minor_version, + version, params: StartupMessageParams { params: msg.slice_ref(s.as_bytes()), }, @@ -522,6 +557,10 @@ pub enum BeMessage<'a> { RowDescription(&'a [RowDescriptor<'a>]), XLogData(XLogDataBody<'a>), NoticeResponse(&'a str), + NegotiateProtocolVersion { + version: ProtocolVersion, + options: &'a [&'a str], + }, KeepAlive(WalSndKeepAlive), } @@ -945,6 +984,18 @@ impl<'a> BeMessage<'a> { buf.put_u8(u8::from(req.request_reply)); }); } + + BeMessage::NegotiateProtocolVersion { version, options } => { + buf.put_u8(b'v'); + write_body(buf, |buf| { + buf.put_u32(version.0); + buf.put_u32(options.len() as u32); + for option in options.iter() { + write_cstr(option, buf)?; + } + Ok(()) + })? + } } Ok(()) } diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index 8a8f6212e99b..fa3f2cba58d7 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -1,6 +1,5 @@ use std::{fmt::Debug, num::NonZeroUsize, str::FromStr, time::Duration}; -use anyhow::bail; use aws_sdk_s3::types::StorageClass; use camino::Utf8PathBuf; @@ -176,20 +175,8 @@ fn serialize_storage_class( impl RemoteStorageConfig { pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120); - pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result> { - let document: toml_edit::Document = match toml { - toml_edit::Item::Table(toml) => toml.clone().into(), - toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => { - toml.clone().into_table().into() - } - _ => bail!("toml not a table or inline table"), - }; - - if document.is_empty() { - return Ok(None); - } - - Ok(Some(toml_edit::de::from_document(document)?)) + pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { + Ok(utils::toml_edit_ext::deserialize_item(toml)?) } } @@ -197,7 +184,7 @@ impl RemoteStorageConfig { mod tests { use super::*; - fn parse(input: &str) -> anyhow::Result> { + fn parse(input: &str) -> anyhow::Result { let toml = input.parse::().unwrap(); RemoteStorageConfig::from_toml(toml.as_item()) } @@ -207,7 +194,7 @@ mod tests { let input = "local_path = '.' 
timeout = '5s'"; - let config = parse(input).unwrap().expect("it exists"); + let config = parse(input).unwrap(); assert_eq!( config, @@ -229,7 +216,7 @@ timeout = '5s'"; timeout = '7s' "; - let config = parse(toml).unwrap().expect("it exists"); + let config = parse(toml).unwrap(); assert_eq!( config, @@ -257,7 +244,7 @@ timeout = '5s'"; timeout = '7s' "; - let config = parse(toml).unwrap().expect("it exists"); + let config = parse(toml).unwrap(); assert_eq!( config, diff --git a/libs/tenant_size_model/src/calculation.rs b/libs/tenant_size_model/src/calculation.rs index f05997ee6547..be005622199d 100644 --- a/libs/tenant_size_model/src/calculation.rs +++ b/libs/tenant_size_model/src/calculation.rs @@ -34,10 +34,10 @@ struct SegmentSize { } struct SizeAlternatives { - // cheapest alternative if parent is available. + /// cheapest alternative if parent is available. incremental: SegmentSize, - // cheapest alternative if parent node is not available + /// cheapest alternative if parent node is not available non_incremental: Option, } diff --git a/libs/tenant_size_model/src/svg.rs b/libs/tenant_size_model/src/svg.rs index f26d3aa79d1a..0de2890bb414 100644 --- a/libs/tenant_size_model/src/svg.rs +++ b/libs/tenant_size_model/src/svg.rs @@ -3,10 +3,17 @@ use std::fmt::Write; const SVG_WIDTH: f32 = 500.0; +/// Different branch kind for SVG drawing. +#[derive(PartialEq)] +pub enum SvgBranchKind { + Timeline, + Lease, +} + struct SvgDraw<'a> { storage: &'a StorageModel, branches: &'a [String], - seg_to_branch: &'a [usize], + seg_to_branch: &'a [(usize, SvgBranchKind)], sizes: &'a [SegmentSizeResult], // layout @@ -42,13 +49,18 @@ fn draw_legend(result: &mut String) -> anyhow::Result<()> { "" )?; writeln!(result, "WAL not retained")?; + writeln!( + result, + "" + )?; + writeln!(result, "LSN lease")?; Ok(()) } pub fn draw_svg( storage: &StorageModel, branches: &[String], - seg_to_branch: &[usize], + seg_to_branch: &[(usize, SvgBranchKind)], sizes: &SizeResult, ) -> anyhow::Result { let mut draw = SvgDraw { @@ -100,7 +112,7 @@ impl<'a> SvgDraw<'a> { // Layout the timelines on Y dimension. 
// TODO - let mut y = 100.0; + let mut y = 120.0; let mut branch_y_coordinates = Vec::new(); for _branch in self.branches { branch_y_coordinates.push(y); @@ -109,7 +121,7 @@ impl<'a> SvgDraw<'a> { // Calculate coordinates for each point let seg_coordinates = std::iter::zip(segments, self.seg_to_branch) - .map(|(seg, branch_id)| { + .map(|(seg, (branch_id, _))| { let x = (seg.lsn - min_lsn) as f32 / xscale; let y = branch_y_coordinates[*branch_id]; (x, y) @@ -175,6 +187,22 @@ impl<'a> SvgDraw<'a> { // draw a snapshot point if it's needed let (coord_x, coord_y) = self.seg_coordinates[seg_id]; + + let (_, kind) = &self.seg_to_branch[seg_id]; + if kind == &SvgBranchKind::Lease { + let (x1, y1) = (coord_x, coord_y - 10.0); + let (x2, y2) = (coord_x, coord_y + 10.0); + + let style = "stroke-width=\"3\" stroke=\"blue\""; + + writeln!( + result, + "", + )?; + writeln!(result, " leased lsn at {}", seg.lsn)?; + writeln!(result, "")?; + } + if self.sizes[seg_id].method == SegmentMethod::SnapshotHere { writeln!( result, diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index a6a081c5c144..261ca2cc1ac0 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -40,6 +40,7 @@ thiserror.workspace = true tokio.workspace = true tokio-tar.workspace = true tokio-util.workspace = true +toml_edit.workspace = true tracing.workspace = true tracing-error.workspace = true tracing-subscriber = { workspace = true, features = ["json", "registry"] } diff --git a/libs/utils/src/http/request.rs b/libs/utils/src/http/request.rs index 766bbfc9dfae..8b8ed5a67f39 100644 --- a/libs/utils/src/http/request.rs +++ b/libs/utils/src/http/request.rs @@ -74,6 +74,15 @@ pub fn parse_query_param>( .transpose() } +pub fn must_parse_query_param>( + request: &Request, + param_name: &str, +) -> Result { + parse_query_param(request, param_name)?.ok_or_else(|| { + ApiError::BadRequest(anyhow!("no {param_name} specified in query parameters")) + }) +} + pub async fn ensure_no_body(request: &mut Request) -> Result<(), ApiError> { match request.body_mut().data().await { Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))), diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 2953f0aad4fd..711e617801ea 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -26,6 +26,8 @@ pub mod auth; // utility functions and helper traits for unified unique id generation/serialization etc. pub mod id; +pub mod shard; + mod hex; pub use hex::Hex; @@ -94,6 +96,8 @@ pub mod env; pub mod poison; +pub mod toml_edit_ext; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs new file mode 100644 index 000000000000..4f9ac6bdb49a --- /dev/null +++ b/libs/utils/src/shard.rs @@ -0,0 +1,451 @@ +//! See `pageserver_api::shard` for description on sharding. + +use std::{ops::RangeInclusive, str::FromStr}; + +use hex::FromHex; +use serde::{Deserialize, Serialize}; + +use crate::id::TenantId; + +#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] +pub struct ShardNumber(pub u8); + +#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] +pub struct ShardCount(pub u8); + +/// Combination of ShardNumber and ShardCount. 
For use within the context of a particular tenant, +/// when we need to know which shard we're dealing with, but do not need to know the full +/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know +/// the fully qualified TenantShardId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct ShardIndex { + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + +/// Formatting helper, for generating the `shard_id` label in traces. +pub struct ShardSlug<'a>(&'a TenantShardId); + +/// TenantShardId globally identifies a particular shard in a particular tenant. +/// +/// These are written as `-`, for example: +/// # The second shard in a two-shard tenant +/// 072f1291a5310026820b2fe4b2968934-0102 +/// +/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without +/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables +/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`]. +/// +/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs, +/// is both forward and backward compatible with TenantId: a legacy TenantId can be +/// decoded as a TenantShardId, and when re-encoded it will be parseable +/// as a TenantId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct TenantShardId { + pub tenant_id: TenantId, + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + +impl ShardCount { + pub const MAX: Self = Self(u8::MAX); + + /// The internal value of a ShardCount may be zero, which means "1 shard, but use + /// legacy format for TenantShardId that excludes the shard suffix", also known + /// as [`TenantShardId::unsharded`]. + /// + /// This method returns the actual number of shards, i.e. if our internal value is + /// zero, we return 1 (unsharded tenants have 1 shard). + pub fn count(&self) -> u8 { + if self.0 > 0 { + self.0 + } else { + 1 + } + } + + /// The literal internal value: this is **not** the number of shards in the + /// tenant, as we have a special zero value for legacy unsharded tenants. Use + /// [`Self::count`] if you want to know the cardinality of shards. + pub fn literal(&self) -> u8 { + self.0 + } + + /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but + /// uses the legacy format for `TenantShardId`. See also the documentation for + /// [`Self::count`]. + pub fn is_unsharded(&self) -> bool { + self.0 == 0 + } + + /// `v` may be zero, or the number of shards in the tenant. `v` is what + /// [`Self::literal`] would return. + pub const fn new(val: u8) -> Self { + Self(val) + } +} + +impl ShardNumber { + pub const MAX: Self = Self(u8::MAX); +} + +impl TenantShardId { + pub fn unsharded(tenant_id: TenantId) -> Self { + Self { + tenant_id, + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + } + } + + /// The range of all TenantShardId that belong to a particular TenantId. This is useful when + /// you have a BTreeMap of TenantShardId, and are querying by TenantId. + pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive { + RangeInclusive::new( + Self { + tenant_id, + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + }, + Self { + tenant_id, + shard_number: ShardNumber::MAX, + shard_count: ShardCount::MAX, + }, + ) + } + + pub fn shard_slug(&self) -> impl std::fmt::Display + '_ { + ShardSlug(self) + } + + /// Convenience for code that has special behavior on the 0th shard. 
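// Illustrative sketch (follows directly from the ShardCount helpers above): the
// internal value 0 is the legacy "unsharded" encoding, yet it still represents
// exactly one shard.
//
//     assert_eq!(ShardCount::new(0).count(), 1);
//     assert!(ShardCount::new(0).is_unsharded());
//     assert_eq!(ShardCount::new(4).count(), 4);
//     assert_eq!(ShardCount::new(4).literal(), 4);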
+ pub fn is_shard_zero(&self) -> bool { + self.shard_number == ShardNumber(0) + } + + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. + pub fn is_unsharded(&self) -> bool { + self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() + } + + /// Convenience for dropping the tenant_id and just getting the ShardIndex: this + /// is useful when logging from code that is already in a span that includes tenant ID, to + /// keep messages reasonably terse. + pub fn to_index(&self) -> ShardIndex { + ShardIndex { + shard_number: self.shard_number, + shard_count: self.shard_count, + } + } + + /// Calculate the children of this TenantShardId when splitting the overall tenant into + /// the given number of shards. + pub fn split(&self, new_shard_count: ShardCount) -> Vec { + let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1); + let mut child_shards = Vec::new(); + for shard_number in 0..ShardNumber(new_shard_count.0).0 { + // Key mapping is based on a round robin mapping of key hash modulo shard count, + // so our child shards are the ones which the same keys would map to. + if shard_number % effective_old_shard_count == self.shard_number.0 { + child_shards.push(TenantShardId { + tenant_id: self.tenant_id, + shard_number: ShardNumber(shard_number), + shard_count: new_shard_count, + }) + } + } + + child_shards + } +} + +impl<'a> std::fmt::Display for ShardSlug<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{:02x}{:02x}", + self.0.shard_number.0, self.0.shard_count.0 + ) + } +} + +impl std::fmt::Display for TenantShardId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.shard_count != ShardCount(0) { + write!(f, "{}-{}", self.tenant_id, self.shard_slug()) + } else { + // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this + // is distinct from the normal single shard case (shard count == 1). 
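// Illustrative sketch (tenant id value borrowed from the doc comment above): an
// unsharded id renders as a bare TenantId, while shard 0 of a one-shard tenant
// keeps its suffix, and split() hands out children by the same round-robin rule.
//
//     let t = TenantId::from_str("072f1291a5310026820b2fe4b2968934")?;
//     assert_eq!(TenantShardId::unsharded(t).to_string(),
//                "072f1291a5310026820b2fe4b2968934");
//     let single = TenantShardId {
//         tenant_id: t,
//         shard_number: ShardNumber(0),
//         shard_count: ShardCount(1),
//     };
//     assert_eq!(single.to_string(), "072f1291a5310026820b2fe4b2968934-0001");
//
//     // splitting 2 shards into 4: shard 0 owns children 0 and 2
//     let parent = TenantShardId { shard_count: ShardCount(2), ..single };
//     let children: Vec<_> = parent.split(ShardCount(4))
//         .iter().map(|c| c.shard_number).collect();
//     assert_eq!(children, vec![ShardNumber(0), ShardNumber(2)]);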
+ self.tenant_id.fmt(f) + } + } +} + +impl std::fmt::Debug for TenantShardId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // Debug is the same as Display: the compact hex representation + write!(f, "{}", self) + } +} + +impl std::str::FromStr for TenantShardId { + type Err = hex::FromHexError; + + fn from_str(s: &str) -> Result { + // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count + if s.len() == 32 { + // Legacy case: no shard specified + Ok(Self { + tenant_id: TenantId::from_str(s)?, + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + }) + } else if s.len() == 37 { + let bytes = s.as_bytes(); + let tenant_id = TenantId::from_hex(&bytes[0..32])?; + let mut shard_parts: [u8; 2] = [0u8; 2]; + hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?; + Ok(Self { + tenant_id, + shard_number: ShardNumber(shard_parts[0]), + shard_count: ShardCount(shard_parts[1]), + }) + } else { + Err(hex::FromHexError::InvalidStringLength) + } + } +} + +impl From<[u8; 18]> for TenantShardId { + fn from(b: [u8; 18]) -> Self { + let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap(); + + Self { + tenant_id: TenantId::from(tenant_id_bytes), + shard_number: ShardNumber(b[16]), + shard_count: ShardCount(b[17]), + } + } +} + +impl ShardIndex { + pub fn new(number: ShardNumber, count: ShardCount) -> Self { + Self { + shard_number: number, + shard_count: count, + } + } + pub fn unsharded() -> Self { + Self { + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + } + } + + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. + pub fn is_unsharded(&self) -> bool { + self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) + } + + /// For use in constructing remote storage paths: concatenate this with a TenantId + /// to get a fully qualified TenantShardId. + /// + /// Backward compat: this function returns an empty string if Self::is_unsharded, such + /// that the legacy pre-sharding remote key format is preserved. 
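// Illustrative sketch (values follow from the implementation just below): legacy
// unsharded tenants keep their pre-sharding remote keys, everything else gains a
// hex "-{number}{count}" suffix.
//
//     assert_eq!(ShardIndex::unsharded().get_suffix(), "");
//     assert_eq!(ShardIndex::new(ShardNumber(3), ShardCount(8)).get_suffix(), "-0308");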
+ pub fn get_suffix(&self) -> String { + if self.is_unsharded() { + "".to_string() + } else { + format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0) + } + } +} + +impl std::fmt::Display for ShardIndex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0) + } +} + +impl std::fmt::Debug for ShardIndex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // Debug is the same as Display: the compact hex representation + write!(f, "{}", self) + } +} + +impl std::str::FromStr for ShardIndex { + type Err = hex::FromHexError; + + fn from_str(s: &str) -> Result { + // Expect format: 1 byte shard number, 1 byte shard count + if s.len() == 4 { + let bytes = s.as_bytes(); + let mut shard_parts: [u8; 2] = [0u8; 2]; + hex::decode_to_slice(bytes, &mut shard_parts)?; + Ok(Self { + shard_number: ShardNumber(shard_parts[0]), + shard_count: ShardCount(shard_parts[1]), + }) + } else { + Err(hex::FromHexError::InvalidStringLength) + } + } +} + +impl From<[u8; 2]> for ShardIndex { + fn from(b: [u8; 2]) -> Self { + Self { + shard_number: ShardNumber(b[0]), + shard_count: ShardCount(b[1]), + } + } +} + +impl Serialize for TenantShardId { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + if serializer.is_human_readable() { + serializer.collect_str(self) + } else { + // Note: while human encoding of [`TenantShardId`] is backward and forward + // compatible, this binary encoding is not. + let mut packed: [u8; 18] = [0; 18]; + packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); + packed[16] = self.shard_number.0; + packed[17] = self.shard_count.0; + + packed.serialize(serializer) + } + } +} + +impl<'de> Deserialize<'de> for TenantShardId { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + struct IdVisitor { + is_human_readable_deserializer: bool, + } + + impl<'de> serde::de::Visitor<'de> for IdVisitor { + type Value = TenantShardId; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + if self.is_human_readable_deserializer { + formatter.write_str("value in form of hex string") + } else { + formatter.write_str("value in form of integer array([u8; 18])") + } + } + + fn visit_seq(self, seq: A) -> Result + where + A: serde::de::SeqAccess<'de>, + { + let s = serde::de::value::SeqAccessDeserializer::new(seq); + let id: [u8; 18] = Deserialize::deserialize(s)?; + Ok(TenantShardId::from(id)) + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + TenantShardId::from_str(v).map_err(E::custom) + } + } + + if deserializer.is_human_readable() { + deserializer.deserialize_str(IdVisitor { + is_human_readable_deserializer: true, + }) + } else { + deserializer.deserialize_tuple( + 18, + IdVisitor { + is_human_readable_deserializer: false, + }, + ) + } + } +} + +impl Serialize for ShardIndex { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + if serializer.is_human_readable() { + serializer.collect_str(self) + } else { + // Binary encoding is not used in index_part.json, but is included in anticipation of + // switching various structures (e.g. inter-process communication, remote metadata) to more + // compact binary encodings in future. 
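// Illustrative sketch (serde_json/bincode are only examples of human-readable vs.
// binary serializers; neither is used by this module itself):
//
//     let idx = ShardIndex::new(ShardNumber(1), ShardCount(2));
//     assert_eq!(serde_json::to_string(&idx)?, "\"0102\"");  // 4-char hex string
//     assert_eq!("0102".parse::<ShardIndex>()?, idx);        // and back again
//     // under bincode the same value is just the two raw bytes [0x01, 0x02]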
+ let mut packed: [u8; 2] = [0; 2]; + packed[0] = self.shard_number.0; + packed[1] = self.shard_count.0; + packed.serialize(serializer) + } + } +} + +impl<'de> Deserialize<'de> for ShardIndex { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + struct IdVisitor { + is_human_readable_deserializer: bool, + } + + impl<'de> serde::de::Visitor<'de> for IdVisitor { + type Value = ShardIndex; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + if self.is_human_readable_deserializer { + formatter.write_str("value in form of hex string") + } else { + formatter.write_str("value in form of integer array([u8; 2])") + } + } + + fn visit_seq(self, seq: A) -> Result + where + A: serde::de::SeqAccess<'de>, + { + let s = serde::de::value::SeqAccessDeserializer::new(seq); + let id: [u8; 2] = Deserialize::deserialize(s)?; + Ok(ShardIndex::from(id)) + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + ShardIndex::from_str(v).map_err(E::custom) + } + } + + if deserializer.is_human_readable() { + deserializer.deserialize_str(IdVisitor { + is_human_readable_deserializer: true, + }) + } else { + deserializer.deserialize_tuple( + 2, + IdVisitor { + is_human_readable_deserializer: false, + }, + ) + } + } +} diff --git a/libs/utils/src/toml_edit_ext.rs b/libs/utils/src/toml_edit_ext.rs new file mode 100644 index 000000000000..ab5f7bdd95ab --- /dev/null +++ b/libs/utils/src/toml_edit_ext.rs @@ -0,0 +1,22 @@ +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("item is not a document")] + ItemIsNotADocument, + #[error(transparent)] + Serde(toml_edit::de::Error), +} + +pub fn deserialize_item(item: &toml_edit::Item) -> Result +where + T: serde::de::DeserializeOwned, +{ + let document: toml_edit::Document = match item { + toml_edit::Item::Table(toml) => toml.clone().into(), + toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => { + toml.clone().into_table().into() + } + _ => return Err(Error::ItemIsNotADocument), + }; + + toml_edit::de::from_document(document).map_err(Error::Serde) +} diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 4335f38f1e7f..0d9343d64382 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -62,6 +62,7 @@ sync_wrapper.workspace = true sysinfo.workspace = true tokio-tar.workspace = true thiserror.workspace = true +tikv-jemallocator.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } tokio-epoll-uring.workspace = true tokio-io-timeout.workspace = true diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 5aab10e5d9c0..edc09d0bf22a 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -48,6 +48,7 @@ //! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms] //! 
``` +use anyhow::Context; use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager}; @@ -188,6 +189,7 @@ impl Request { manager .request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version) .await + .context("request_redo") } fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord { diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index 0ed27602cd3c..a938367334fa 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -8,7 +8,7 @@ license.workspace = true pageserver_api.workspace = true thiserror.workspace = true async-trait.workspace = true -reqwest.workspace = true +reqwest = { workspace = true, features = [ "stream" ] } utils.workspace = true serde.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 69b86d9c466a..e3ddb446fa2c 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -9,6 +9,8 @@ use utils::{ lsn::Lsn, }; +pub use reqwest::Body as ReqwestBody; + pub mod util; #[derive(Debug, Clone)] @@ -20,6 +22,9 @@ pub struct Client { #[derive(thiserror::Error, Debug)] pub enum Error { + #[error("send request: {0}")] + SendRequest(reqwest::Error), + #[error("receive body: {0}")] ReceiveBody(reqwest::Error), @@ -173,19 +178,30 @@ impl Client { self.request(Method::GET, uri, ()).await } - async fn request_noerror( + fn start_request( &self, method: Method, uri: U, - body: B, - ) -> Result { + ) -> reqwest::RequestBuilder { let req = self.client.request(method, uri); - let req = if let Some(value) = &self.authorization_header { + if let Some(value) = &self.authorization_header { req.header(reqwest::header::AUTHORIZATION, value) } else { req - }; - req.json(&body).send().await.map_err(Error::ReceiveBody) + } + } + + async fn request_noerror( + &self, + method: Method, + uri: U, + body: B, + ) -> Result { + self.start_request(method, uri) + .json(&body) + .send() + .await + .map_err(Error::ReceiveBody) } async fn request( @@ -205,15 +221,6 @@ impl Client { Ok(()) } - pub async fn tenant_create(&self, req: &TenantCreateRequest) -> Result { - let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint); - self.request(Method::POST, &uri, req) - .await? - .json() - .await - .map_err(Error::ReceiveBody) - } - /// The tenant deletion API can return 202 if deletion is incomplete, or /// 404 if it is complete. Callers are responsible for checking the status /// code and retrying. Error codes other than 404 will return Err(). @@ -618,4 +625,53 @@ impl Client { }), } } + + pub async fn import_basebackup( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + base_lsn: Lsn, + end_lsn: Lsn, + pg_version: u32, + basebackup_tarball: ReqwestBody, + ) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}", + self.mgmt_api_endpoint, + ); + self.start_request(Method::PUT, uri) + .body(basebackup_tarball) + .send() + .await + .map_err(Error::SendRequest)? + .error_from_body() + .await? 
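// Illustrative sketch of a caller (hypothetical; the file path and identifiers
// are made up). The reqwest "stream" feature enabled in this diff lets the
// basebackup tarball be streamed from disk rather than buffered in memory:
//
//     let file = tokio::fs::File::open("/tmp/base.tar").await?;
//     let body = ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(file));
//     client
//         .import_basebackup(tenant_id, timeline_id, base_lsn, end_lsn, pg_version, body)
//         .await?;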
+ .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn import_wal( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + start_lsn: Lsn, + end_lsn: Lsn, + wal_tarball: ReqwestBody, + ) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_wal?start_lsn={start_lsn}&end_lsn={end_lsn}", + self.mgmt_api_endpoint, + ); + self.start_request(Method::PUT, uri) + .body(wal_tarball) + .send() + .await + .map_err(Error::SendRequest)? + .error_from_body() + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } } diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 50c3ac4c6143..ea09a011e5cf 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -178,7 +178,7 @@ async fn main() -> anyhow::Result<()> { let toml_item = toml_document .get("remote_storage") .expect("need remote_storage"); - let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config"); + let config = RemoteStorageConfig::from_toml(toml_item)?; let storage = remote_storage::GenericRemoteStorage::from_config(&config); let cancel = CancellationToken::new(); storage diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 0f057a43683c..207f781e1b27 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -348,35 +348,36 @@ where self.add_rel(rel, rel).await?; } } + } - for (path, content) in self - .timeline - .list_aux_files(self.lsn, self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))? - { - if path.starts_with("pg_replslot") { - let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; - let restart_lsn = Lsn(u64::from_le_bytes( - content[offs..offs + 8].try_into().unwrap(), - )); - info!("Replication slot {} restart LSN={}", path, restart_lsn); - min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn); - } else if path == "pg_logical/replorigin_checkpoint" { - // replorigin_checkoint is written only on compute shutdown, so it contains - // deteriorated values. So we generate our own version of this file for the particular LSN - // based on information about replorigins extracted from transaction commit records. - // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all, - // but now we should handle (skip) it for backward compatibility. - continue; - } - let header = new_tar_header(&path, content.len() as u64)?; - self.ar - .append(&header, &*content) - .await - .context("could not add aux file to basebackup tarball")?; + for (path, content) in self + .timeline + .list_aux_files(self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))? + { + if path.starts_with("pg_replslot") { + let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; + let restart_lsn = Lsn(u64::from_le_bytes( + content[offs..offs + 8].try_into().unwrap(), + )); + info!("Replication slot {} restart LSN={}", path, restart_lsn); + min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn); + } else if path == "pg_logical/replorigin_checkpoint" { + // replorigin_checkoint is written only on compute shutdown, so it contains + // deteriorated values. So we generate our own version of this file for the particular LSN + // based on information about replorigins extracted from transaction commit records. + // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all, + // but now we should handle (skip) it for backward compatibility. 
+ continue; } + let header = new_tar_header(&path, content.len() as u64)?; + self.ar + .append(&header, &*content) + .await + .context("could not add aux file to basebackup tarball")?; } + if min_restart_lsn != Lsn::MAX { info!( "Min restart LSN for logical replication is {}", diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index ba5b2608bdf1..9f705f0bc923 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -47,6 +47,9 @@ use utils::{ project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + const PID_FILE_NAME: &str = "pageserver.pid"; const FEATURES: &[&str] = &[ @@ -421,6 +424,10 @@ fn start_pageserver( background_jobs_can_start: background_jobs_barrier.clone(), }; + info!(config=?conf.l0_flush, "using l0_flush config"); + let l0_flush_global_state = + pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone()); + // Scan the local 'tenants/' directory and start loading the tenants let deletion_queue_client = deletion_queue.new_client(); let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( @@ -429,6 +436,7 @@ fn start_pageserver( broker_client: broker_client.clone(), remote_storage: remote_storage.clone(), deletion_queue_client, + l0_flush_global_state, }, order, shutdown_pageserver.clone(), @@ -652,7 +660,6 @@ fn start_pageserver( async move { page_service::libpq_listener_main( tenant_manager, - broker_client, pg_auth, pageserver_listener, conf.pg_auth_type, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index f36e63f035c7..17bc427b2cf1 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -5,7 +5,7 @@ //! See also `settings.md` for better description on every parameter. 
use anyhow::{anyhow, bail, ensure, Context, Result}; -use pageserver_api::shard::TenantShardId; +use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId}; use remote_storage::{RemotePath, RemoteStorageConfig}; use serde; use serde::de::IntoDeserializer; @@ -30,11 +30,11 @@ use utils::{ logging::LogFormat, }; -use crate::tenant::timeline::GetVectoredImpl; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine}; +use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl}; use crate::{tenant::config::TenantConf, virtual_file}; use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX}; @@ -50,6 +50,7 @@ pub mod defaults { DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_PG_LISTEN_PORT, }; + use pageserver_api::models::ImageCompressionAlgorithm; pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; @@ -90,6 +91,9 @@ pub mod defaults { pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB + pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm = + ImageCompressionAlgorithm::Disabled; + pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; @@ -159,7 +163,7 @@ pub mod defaults { #ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB} -[remote_storage] +#[remote_storage] "# ); @@ -285,12 +289,16 @@ pub struct PageServerConf { pub validate_vectored_get: bool, + pub image_compression: ImageCompressionAlgorithm, + /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this /// is exceeded, we start proactively closing ephemeral layers to limit the total amount /// of ephemeral data. /// /// Setting this to zero disables limits on total ephemeral layer size. 
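    /// Worked example of the ratio documented above (numbers are made up): on a
    /// node with 16 GiB of RAM, `ephemeral_bytes_per_memory_kb = 512` caps
    /// ephemeral layer content at 16 * 1024 * 1024 KiB * 512 B/KiB = 8 GiB,
    /// while the default of 0 leaves the total size unlimited.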
pub ephemeral_bytes_per_memory_kb: usize, + + pub l0_flush: L0FlushConfig, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -395,7 +403,11 @@ struct PageServerConfigBuilder { validate_vectored_get: BuilderValue, + image_compression: BuilderValue, + ephemeral_bytes_per_memory_kb: BuilderValue, + + l0_flush: BuilderValue, } impl PageServerConfigBuilder { @@ -482,8 +494,10 @@ impl PageServerConfigBuilder { max_vectored_read_bytes: Set(MaxVectoredReadBytes( NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), + image_compression: Set(DEFAULT_IMAGE_COMPRESSION), validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), + l0_flush: Set(L0FlushConfig::default()), } } } @@ -667,10 +681,18 @@ impl PageServerConfigBuilder { self.validate_vectored_get = BuilderValue::Set(value); } + pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) { + self.image_compression = BuilderValue::Set(value); + } + pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) { self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); } + pub fn l0_flush(&mut self, value: L0FlushConfig) { + self.l0_flush = BuilderValue::Set(value); + } + pub fn build(self) -> anyhow::Result { let default = Self::default_values(); @@ -727,7 +749,9 @@ impl PageServerConfigBuilder { get_impl, max_vectored_read_bytes, validate_vectored_get, + image_compression, ephemeral_bytes_per_memory_kb, + l0_flush, } CUSTOM LOGIC { @@ -918,7 +942,7 @@ impl PageServerConf { "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?), "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?), "remote_storage" => { - builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?) + builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item).context("remote_storage")?)) } "tenant_config" => { t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?; @@ -946,7 +970,7 @@ impl PageServerConf { builder.metric_collection_endpoint(Some(endpoint)); }, "metric_collection_bucket" => { - builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?) + builder.metric_collection_bucket(Some(RemoteStorageConfig::from_toml(item)?)) } "synthetic_size_calculation_interval" => builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), @@ -1004,9 +1028,15 @@ impl PageServerConf { "validate_vectored_get" => { builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?) } + "image_compression" => { + builder.get_image_compression(parse_toml_from_str("image_compression", item)?) + } "ephemeral_bytes_per_memory_kb" => { builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) } + "l0_flush" => { + builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?) 
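// Illustrative sketch (spelling follows the serde attributes on L0FlushConfig,
// which is added later in this diff: tag = "mode", kebab-case variants,
// snake_case fields). In pageserver.toml the new knob reads either
//
//     l0_flush = { mode = "page-cached" }
// or
//     l0_flush = { mode = "direct", max_concurrency = 4 }
//
// and leaving it out falls back to L0FlushConfig::default(), i.e. page-cached.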
+ } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1088,8 +1118,10 @@ impl PageServerConf { NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant"), ), + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + l0_flush: L0FlushConfig::default(), } } } @@ -1328,7 +1360,9 @@ background_task_maximum_delay = '334 s' .expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + l0_flush: L0FlushConfig::default(), }, "Correct defaults should be used when no config values are provided" ); @@ -1401,7 +1435,9 @@ background_task_maximum_delay = '334 s' .expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + l0_flush: L0FlushConfig::default(), }, "Should be able to parse all basic config values correctly" ); @@ -1681,6 +1717,19 @@ threshold = "20m" } } + #[test] + fn empty_remote_storage_is_error() { + let tempdir = tempdir().unwrap(); + let (workdir, _) = prepare_fs(&tempdir).unwrap(); + let input = r#" +remote_storage = {} + "#; + let doc = toml_edit::Document::from_str(input).unwrap(); + let err = PageServerConf::parse_and_validate(&doc, &workdir) + .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage"); + assert!(format!("{err}").contains("remote_storage"), "{err}"); + } + fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> { let tempdir_path = tempdir.path(); diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index bf06c78e673f..d215fd2b7d2d 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -190,7 +190,7 @@ where } } else { // If we failed validation, then do not apply any of the projected updates - warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation); + info!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation); metrics::DELETION_QUEUE.dropped_lsn_updates.inc(); } } @@ -225,7 +225,7 @@ where && (tenant.generation == *validated_generation); if !this_list_valid { - warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation); + info!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation); metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64); mutated = true; } else { diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 58ff6e3f83cc..5ba329f05ece 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -265,15 +265,19 @@ paths: type: string format: hex post: - description: Obtain lease for the given LSN - parameters: - - name: lsn - in: query - required: true - schema: - type: string - format: hex - description: A LSN to obtain the lease for + description: Obtains a lease for the given LSN. 
+ requestBody: + content: + application/json: + schema: + type: object + required: + - lsn + properties: + lsn: + description: A LSN to obtain the lease for. + type: string + format: hex responses: "200": description: OK diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 5ebd34a40690..6f8f3e6389d5 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -10,6 +10,7 @@ use std::time::Duration; use anyhow::{anyhow, Context, Result}; use enumset::EnumSet; +use futures::StreamExt; use futures::TryFutureExt; use humantime::format_rfc3339; use hyper::header; @@ -22,6 +23,7 @@ use pageserver_api::models::ListAuxFilesRequest; use pageserver_api::models::LocationConfig; use pageserver_api::models::LocationConfigListResponse; use pageserver_api::models::LsnLease; +use pageserver_api::models::LsnLeaseRequest; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; use pageserver_api::models::TenantLocationConfigResponse; @@ -42,18 +44,19 @@ use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use remote_storage::TimeTravelError; -use tenant_size_model::{SizeResult, StorageModel}; +use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel}; +use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; use utils::failpoint_support::failpoints_handler; use utils::http::endpoint::prometheus_metrics_handler; use utils::http::endpoint::request_span; +use utils::http::request::must_parse_query_param; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; -use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::{LocationConf, TenantConfOpt}; @@ -75,13 +78,12 @@ use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; use crate::tenant::GetTimelineError; -use crate::tenant::SpawnMode; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::{config::PageServerConf, tenant::mgr}; use crate::{disk_usage_eviction_task, tenant}; use pageserver_api::models::{ - StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, - TimelineCreateRequest, TimelineGcRequest, TimelineInfo, + StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, + TimelineInfo, }; use utils::{ auth::SwappableJwtAuth, @@ -229,7 +231,7 @@ impl From for ApiError { BadRequest(e) => ApiError::BadRequest(e), Unavailable(_) => ApiError::ShuttingDown, e @ InProgress => ApiError::Conflict(format!("{e}")), - Flush(e) | Other(e) => ApiError::InternalServerError(e), + Flush(e) | InternalError(e) => ApiError::InternalServerError(e), } } } @@ -408,6 +410,8 @@ async fn build_timeline_info_common( let walreceiver_status = timeline.walreceiver_status(); + let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats(); + let info = TimelineInfo { tenant_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, @@ -428,6 +432,8 @@ async fn build_timeline_info_common( directory_entries_counts: timeline.get_directory_metrics().to_vec(), current_physical_size, current_logical_size_non_incremental: None, + 
pitr_history_size, + within_ancestor_pitr, timeline_dir_layer_file_size_sum: None, wal_source_connstr, last_received_msg_lsn, @@ -1193,10 +1199,15 @@ fn synthetic_size_html_response( timeline_map.insert(ti.timeline_id, index); timeline_ids.push(ti.timeline_id.to_string()); } - let seg_to_branch: Vec = inputs + let seg_to_branch: Vec<(usize, SvgBranchKind)> = inputs .segments .iter() - .map(|seg| *timeline_map.get(&seg.timeline_id).unwrap()) + .map(|seg| { + ( + *timeline_map.get(&seg.timeline_id).unwrap(), + seg.kind.into(), + ) + }) .collect(); let svg = @@ -1237,75 +1248,6 @@ pub fn html_response(status: StatusCode, data: String) -> Result, Ok(response) } -/// Helper for requests that may take a generation, which is mandatory -/// when control_plane_api is set, but otherwise defaults to Generation::none() -fn get_request_generation(state: &State, req_gen: Option) -> Result { - if state.conf.control_plane_api.is_some() { - req_gen - .map(Generation::new) - .ok_or(ApiError::BadRequest(anyhow!( - "generation attribute missing" - ))) - } else { - // Legacy mode: all tenants operate with no generation - Ok(Generation::none()) - } -} - -async fn tenant_create_handler( - mut request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - let request_data: TenantCreateRequest = json_request(&mut request).await?; - let target_tenant_id = request_data.new_tenant_id; - check_permission(&request, None)?; - - let _timer = STORAGE_TIME_GLOBAL - .get_metric_with_label_values(&[StorageTimeOperation::CreateTenant.into()]) - .expect("bug") - .start_timer(); - - let tenant_conf = - TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?; - - let state = get_state(&request); - - let generation = get_request_generation(state, request_data.generation)?; - - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); - - let location_conf = - LocationConf::attached_single(tenant_conf, generation, &request_data.shard_parameters); - - let new_tenant = state - .tenant_manager - .upsert_location( - target_tenant_id, - location_conf, - None, - SpawnMode::Create, - &ctx, - ) - .await?; - - let Some(new_tenant) = new_tenant else { - // This should never happen: indicates a bug in upsert_location - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Upsert succeeded but didn't return tenant!" - ))); - }; - // We created the tenant. Existing API semantics are that the tenant - // is Active when this function returns. - new_tenant - .wait_to_become_active(ACTIVE_TENANT_TIMEOUT) - .await?; - - json_response( - StatusCode::CREATED, - TenantCreateResponse(new_tenant.tenant_shard_id().tenant_id), - ) -} - async fn get_tenant_config_handler( request: Request, _cancel: CancellationToken, @@ -1367,7 +1309,7 @@ async fn update_tenant_config_handler( crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; tenant.set_new_tenant_config(new_tenant_conf); json_response(StatusCode::OK, ()) @@ -1598,15 +1540,13 @@ async fn handle_tenant_break( // Obtains an lsn lease on the given timeline. 
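// Illustrative note (request shape follows LsnLeaseRequest and the OpenAPI change
// above; the endpoint path is not shown in this hunk): the target LSN now arrives
// in a JSON body such as {"lsn": "0/15E2C71"} instead of an ?lsn= query parameter.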
async fn lsn_lease_handler( - request: Request, + mut request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; - - let lsn: Lsn = parse_query_param(&request, "lsn")? - .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?; + let lsn = json_request::(&mut request).await?.lsn; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); @@ -2467,6 +2407,189 @@ async fn post_top_tenants( ) } +async fn put_tenant_timeline_import_basebackup( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?; + let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?; + let pg_version: u32 = must_parse_query_param(&request, "pg_version")?; + + check_permission(&request, Some(tenant_id))?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + + let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version); + async move { + let state = get_state(&request); + let tenant = state + .tenant_manager + .get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?; + + let broker_client = state.broker_client.clone(); + + let mut body = StreamReader::new(request.into_body().map(|res| { + res.map_err(|error| { + std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error)) + }) + })); + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let timeline = tenant + .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx) + .map_err(ApiError::InternalServerError) + .await?; + + // TODO mark timeline as not ready until it reaches end_lsn. + // We might have some wal to import as well, and we should prevent compute + // from connecting before that and writing conflicting wal. + // + // This is not relevant for pageserver->pageserver migrations, since there's + // no wal to import. But should be fixed if we want to import from postgres. + + // TODO leave clean state on error. For now you can use detach to clean + // up broken state from a failed import. + + // Import basebackup provided via CopyData + info!("importing basebackup"); + + timeline + .import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx) + .await + .map_err(ApiError::InternalServerError)?; + + // Read the end of the tar archive. + read_tar_eof(body) + .await + .map_err(ApiError::InternalServerError)?; + + // TODO check checksum + // Meanwhile you can verify client-side by taking fullbackup + // and checking that it matches in size with what was imported. + // It wouldn't work if base came from vanilla postgres though, + // since we discard some log files. 
+ + info!("done"); + json_response(StatusCode::OK, ()) + } + .instrument(span) + .await +} + +async fn put_tenant_timeline_import_wal( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let start_lsn: Lsn = must_parse_query_param(&request, "start_lsn")?; + let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?; + + check_permission(&request, Some(tenant_id))?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + + let span = info_span!("import_wal", tenant_id=%tenant_id, timeline_id=%timeline_id, start_lsn=%start_lsn, end_lsn=%end_lsn); + async move { + let state = get_state(&request); + + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?; + + let mut body = StreamReader::new(request.into_body().map(|res| { + res.map_err(|error| { + std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error)) + }) + })); + + let last_record_lsn = timeline.get_last_record_lsn(); + if last_record_lsn != start_lsn { + return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))); + } + + // TODO leave clean state on error. For now you can use detach to clean + // up broken state from a failed import. + + // Import wal provided via CopyData + info!("importing wal"); + crate::import_datadir::import_wal_from_tar(&timeline, &mut body, start_lsn, end_lsn, &ctx).await.map_err(ApiError::InternalServerError)?; + info!("wal import complete"); + + // Read the end of the tar archive. + read_tar_eof(body).await.map_err(ApiError::InternalServerError)?; + + // TODO Does it make sense to overshoot? + if timeline.get_last_record_lsn() < end_lsn { + return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))); + } + + // Flush data to disk, then upload to s3. No need for a forced checkpoint. + // We only want to persist the data, and it doesn't matter if it's in the + // shape of deltas or images. + info!("flushing layers"); + timeline.freeze_and_flush().await.map_err(|e| match e { + tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown, + other => ApiError::InternalServerError(anyhow::anyhow!(other)), + })?; + + info!("done"); + + json_response(StatusCode::OK, ()) + }.instrument(span).await +} + +/// Read the end of a tar archive. +/// +/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each. +/// `tokio_tar` already read the first such block. Read the second all-zeros block, +/// and check that there is no more data after the EOF marker. +/// +/// 'tar' command can also write extra blocks of zeros, up to a record +/// size, controlled by the --record-size argument. Ignore them too. 
+async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow::Result<()> { + use tokio::io::AsyncReadExt; + let mut buf = [0u8; 512]; + + // Read the all-zeros block, and verify it + let mut total_bytes = 0; + while total_bytes < 512 { + let nbytes = reader.read(&mut buf[total_bytes..]).await?; + total_bytes += nbytes; + if nbytes == 0 { + break; + } + } + if total_bytes < 512 { + anyhow::bail!("incomplete or invalid tar EOF marker"); + } + if !buf.iter().all(|&x| x == 0) { + anyhow::bail!("invalid tar EOF marker"); + } + + // Drain any extra zero-blocks after the EOF marker + let mut trailing_bytes = 0; + let mut seen_nonzero_bytes = false; + loop { + let nbytes = reader.read(&mut buf).await?; + trailing_bytes += nbytes; + if !buf.iter().all(|&x| x == 0) { + seen_nonzero_bytes = true; + } + if nbytes == 0 { + break; + } + } + if seen_nonzero_bytes { + anyhow::bail!("unexpected non-zero bytes after the tar archive"); + } + if trailing_bytes % 512 != 0 { + anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive"); + } + Ok(()) +} + /// Common functionality of all the HTTP API handlers. /// /// - Adds a tracing span to each request (by `request_span`) @@ -2611,7 +2734,6 @@ pub fn make_router( api_handler(r, reload_auth_validation_keys_handler) }) .get("/v1/tenant", |r| api_handler(r, tenant_list_handler)) - .post("/v1/tenant", |r| api_handler(r, tenant_create_handler)) .get("/v1/tenant/:tenant_shard_id", |r| { api_handler(r, tenant_status) }) @@ -2762,5 +2884,13 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info", |r| testing_api_handler("perf_info", r, perf_info), ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/import_basebackup", + |r| api_handler(r, put_tenant_timeline_import_basebackup), + ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal", + |r| api_handler(r, put_tenant_timeline_import_wal), + ) .any(handler_404)) } diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs new file mode 100644 index 000000000000..7fe8fedc6394 --- /dev/null +++ b/pageserver/src/l0_flush.rs @@ -0,0 +1,46 @@ +use std::{num::NonZeroUsize, sync::Arc}; + +use crate::tenant::ephemeral_file; + +#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)] +#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +pub enum L0FlushConfig { + #[default] + PageCached, + #[serde(rename_all = "snake_case")] + Direct { max_concurrency: NonZeroUsize }, +} + +#[derive(Clone)] +pub struct L0FlushGlobalState(Arc); + +pub(crate) enum Inner { + PageCached, + Direct { semaphore: tokio::sync::Semaphore }, +} + +impl L0FlushGlobalState { + pub fn new(config: L0FlushConfig) -> Self { + match config { + L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)), + L0FlushConfig::Direct { max_concurrency } => { + let semaphore = tokio::sync::Semaphore::new(max_concurrency.get()); + Self(Arc::new(Inner::Direct { semaphore })) + } + } + } + + pub(crate) fn inner(&self) -> &Arc { + &self.0 + } +} + +impl L0FlushConfig { + pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite { + use L0FlushConfig::*; + match self { + PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes, + Direct { .. 
} => ephemeral_file::PrewarmPageCacheOnWrite::No, + } + } +} diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 353f97264c5f..ac6b9b4f2a60 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -11,6 +11,7 @@ pub mod deletion_queue; pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; +pub mod l0_flush; pub use pageserver_api::keyspace; pub mod aux_file; pub mod metrics; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ca697afcf640..e67fa656d02e 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -8,7 +8,7 @@ use metrics::{ }; use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; -use strum::{EnumCount, IntoEnumIterator, VariantNames}; +use strum::{EnumCount, VariantNames}; use strum_macros::{EnumVariantNames, IntoStaticStr}; use tracing::warn; use utils::id::TimelineId; @@ -53,9 +53,6 @@ pub(crate) enum StorageTimeOperation { #[strum(serialize = "find gc cutoffs")] FindGcCutoffs, - - #[strum(serialize = "create tenant")] - CreateTenant, } pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { @@ -467,6 +464,24 @@ static LAST_RECORD_LSN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static PITR_HISTORY_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_pitr_history_size", + "Data written since PITR cutoff on this timeline", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static TIMELINE_ARCHIVE_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_archive_size", + "Timeline's logical size if it is considered eligible for archival (outside PITR window), else zero", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + static STANDBY_HORIZON: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_standby_horizon", @@ -479,7 +494,7 @@ static STANDBY_HORIZON: Lazy = Lazy::new(|| { static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_resident_physical_size", - "The size of the layer files present in the pageserver's filesystem.", + "The size of the layer files present in the pageserver's filesystem, for attached locations.", &["tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define a metric") @@ -1079,21 +1094,12 @@ pub(crate) mod virtual_file_io_engine { }); } -#[derive(Debug)] -struct GlobalAndPerTimelineHistogram { - global: Histogram, - per_tenant_timeline: Histogram, -} +struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { + global_metric: &'a Histogram, -impl GlobalAndPerTimelineHistogram { - fn observe(&self, value: f64) { - self.global.observe(value); - self.per_tenant_timeline.observe(value); - } -} + // Optional because not all op types are tracked per-timeline + timeline_metric: Option<&'a Histogram>, -struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { - h: &'a GlobalAndPerTimelineHistogram, ctx: &'c RequestContext, start: std::time::Instant, op: SmgrQueryType, @@ -1124,7 +1130,10 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> { elapsed } }; - self.h.observe(ex_throttled.as_secs_f64()); + self.global_metric.observe(ex_throttled.as_secs_f64()); + if let Some(timeline_metric) = self.timeline_metric { + timeline_metric.observe(ex_throttled.as_secs_f64()); + } } } @@ -1149,7 +1158,8 @@ pub enum SmgrQueryType { #[derive(Debug)] pub(crate) struct SmgrQueryTimePerTimeline { - metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT], + global_metrics: 
[Histogram; SmgrQueryType::COUNT], + per_timeline_getpage: Histogram, } static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { @@ -1227,27 +1237,32 @@ impl SmgrQueryTimePerTimeline { let tenant_id = tenant_shard_id.tenant_id.to_string(); let shard_slug = format!("{}", tenant_shard_id.shard_slug()); let timeline_id = timeline_id.to_string(); - let metrics = std::array::from_fn(|i| { + let global_metrics = std::array::from_fn(|i| { let op = SmgrQueryType::from_repr(i).unwrap(); - let global = SMGR_QUERY_TIME_GLOBAL + SMGR_QUERY_TIME_GLOBAL .get_metric_with_label_values(&[op.into()]) - .unwrap(); - let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE - .get_metric_with_label_values(&[op.into(), &tenant_id, &shard_slug, &timeline_id]) - .unwrap(); - GlobalAndPerTimelineHistogram { - global, - per_tenant_timeline, - } + .unwrap() }); - Self { metrics } + + let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE + .get_metric_with_label_values(&[ + SmgrQueryType::GetPageAtLsn.into(), + &tenant_id, + &shard_slug, + &timeline_id, + ]) + .unwrap(); + Self { + global_metrics, + per_timeline_getpage, + } } pub(crate) fn start_timer<'c: 'a, 'a>( &'a self, op: SmgrQueryType, ctx: &'c RequestContext, - ) -> impl Drop + '_ { - let metric = &self.metrics[op as usize]; + ) -> Option { + let global_metric = &self.global_metrics[op as usize]; let start = Instant::now(); match ctx.micros_spent_throttled.open() { Ok(()) => (), @@ -1266,12 +1281,20 @@ impl SmgrQueryTimePerTimeline { }); } } - GlobalAndPerTimelineHistogramTimer { - h: metric, + + let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) { + Some(&self.per_timeline_getpage) + } else { + None + }; + + Some(GlobalAndPerTimelineHistogramTimer { + global_metric, + timeline_metric, ctx, start, op, - } + }) } } @@ -1318,17 +1341,9 @@ mod smgr_query_time_tests { let get_counts = || { let global: u64 = ops .iter() - .map(|op| metrics.metrics[*op as usize].global.get_sample_count()) - .sum(); - let per_tenant_timeline: u64 = ops - .iter() - .map(|op| { - metrics.metrics[*op as usize] - .per_tenant_timeline - .get_sample_count() - }) + .map(|op| metrics.global_metrics[*op as usize].get_sample_count()) .sum(); - (global, per_tenant_timeline) + (global, metrics.per_timeline_getpage.get_sample_count()) }; let (pre_global, pre_per_tenant_timeline) = get_counts(); @@ -1339,7 +1354,12 @@ mod smgr_query_time_tests { drop(timer); let (post_global, post_per_tenant_timeline) = get_counts(); - assert_eq!(post_per_tenant_timeline, 1); + if matches!(op, super::SmgrQueryType::GetPageAtLsn) { + // getpage ops are tracked per-timeline, others aren't + assert_eq!(post_per_tenant_timeline, 1); + } else { + assert_eq!(post_per_tenant_timeline, 0); + } assert!(post_global > pre_global); } } @@ -1436,10 +1456,12 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { } } -pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "pageserver_live_connections", - "Number of live network connections", +pub(crate) static LIVE_CONNECTIONS: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( + "pageserver_live_connections_started", + "Number of network connections that we started handling", + "pageserver_live_connections_finished", + "Number of network connections that we finished handling", &["pageserver_connection_kind"] ) .expect("failed to define a metric") @@ -1450,10 +1472,7 @@ pub(crate) enum ComputeCommandKind { PageStreamV2, PageStream, Basebackup, - GetLastRecordRlsn, Fullbackup, - 
ImportBasebackup, - ImportWal, LeaseLsn, Show, } @@ -1694,6 +1713,15 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| { } }); +pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_secondary_resident_physical_size", + "The size of the layer files present in the pageserver's filesystem, for secondary locations.", + &["tenant_id", "shard_id"] + ) + .expect("failed to define a metric") +}); + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum RemoteOpKind { Upload, @@ -2096,6 +2124,8 @@ pub(crate) struct TimelineMetrics { pub garbage_collect_histo: StorageTimeMetrics, pub find_gc_cutoffs_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, + pub pitr_history_size: UIntGauge, + pub archival_size: UIntGauge, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size @@ -2169,6 +2199,15 @@ impl TimelineMetrics { let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + + let pitr_history_size = PITR_HISTORY_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + + let archival_size = TIMELINE_ARCHIVE_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2221,6 +2260,8 @@ impl TimelineMetrics { find_gc_cutoffs_histo, load_layer_map_histo, last_record_gauge, + pitr_history_size, + archival_size, standby_horizon_gauge, resident_physical_size_gauge, current_logical_size_gauge, @@ -2278,6 +2319,10 @@ impl TimelineMetrics { if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) { let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]); } + + let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]); @@ -2311,14 +2356,12 @@ impl TimelineMetrics { let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]); } - for op in SmgrQueryType::iter() { - let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[ - op.into(), - tenant_id, - shard_id, - timeline_id, - ]); - } + let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[ + SmgrQueryType::GetPageAtLsn.into(), + tenant_id, + shard_id, + timeline_id, + ]); } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 6ea5f396d0a7..c10c2f2a0f9a 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -4,9 +4,7 @@ use anyhow::Context; use async_compression::tokio::write::GzipEncoder; use bytes::Buf; -use bytes::Bytes; use futures::stream::FuturesUnordered; -use futures::Stream; use futures::StreamExt; use pageserver_api::key::Key; use pageserver_api::models::TenantState; @@ -28,7 +26,6 @@ use std::borrow::Cow; use std::collections::HashMap; use std::io; use std::net::TcpListener; -use std::pin::pin; use std::str; use std::str::FromStr; use std::sync::Arc; @@ -37,7 +34,6 @@ use std::time::Instant; use std::time::SystemTime; use tokio::io::AsyncWriteExt; use 
tokio::io::{AsyncRead, AsyncWrite}; -use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::ConnectionId; @@ -53,9 +49,8 @@ use crate::auth::check_permission; use crate::basebackup; use crate::basebackup::BasebackupError; use crate::context::{DownloadBehavior, RequestContext}; -use crate::import_datadir::import_wal_from_tar; use crate::metrics; -use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS_COUNT}; +use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS}; use crate::pgdatadir_mapping::Version; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; @@ -66,7 +61,6 @@ use crate::tenant::mgr::GetTenantError; use crate::tenant::mgr::ShardResolveResult; use crate::tenant::mgr::ShardSelector; use crate::tenant::mgr::TenantManager; -use crate::tenant::timeline::FlushLayerError; use crate::tenant::timeline::WaitLsnError; use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; @@ -82,56 +76,6 @@ use postgres_ffi::BLCKSZ; // is not yet in state [`TenantState::Active`]. const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); -/// Read the end of a tar archive. -/// -/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each. -/// `tokio_tar` already read the first such block. Read the second all-zeros block, -/// and check that there is no more data after the EOF marker. -/// -/// 'tar' command can also write extra blocks of zeros, up to a record -/// size, controlled by the --record-size argument. Ignore them too. -async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> { - use tokio::io::AsyncReadExt; - let mut buf = [0u8; 512]; - - // Read the all-zeros block, and verify it - let mut total_bytes = 0; - while total_bytes < 512 { - let nbytes = reader.read(&mut buf[total_bytes..]).await?; - total_bytes += nbytes; - if nbytes == 0 { - break; - } - } - if total_bytes < 512 { - anyhow::bail!("incomplete or invalid tar EOF marker"); - } - if !buf.iter().all(|&x| x == 0) { - anyhow::bail!("invalid tar EOF marker"); - } - - // Drain any extra zero-blocks after the EOF marker - let mut trailing_bytes = 0; - let mut seen_nonzero_bytes = false; - loop { - let nbytes = reader.read(&mut buf).await?; - trailing_bytes += nbytes; - if !buf.iter().all(|&x| x == 0) { - seen_nonzero_bytes = true; - } - if nbytes == 0 { - break; - } - } - if seen_nonzero_bytes { - anyhow::bail!("unexpected non-zero bytes after the tar archive"); - } - if trailing_bytes % 512 != 0 { - anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive"); - } - Ok(()) -} - /////////////////////////////////////////////////////////////////////////////// /// @@ -141,7 +85,6 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<() /// pub async fn libpq_listener_main( tenant_manager: Arc, - broker_client: storage_broker::BrokerClientChannel, auth: Option>, listener: TcpListener, auth_type: AuthType, @@ -186,7 +129,6 @@ pub async fn libpq_listener_main( false, page_service_conn_main( tenant_manager.clone(), - broker_client.clone(), local_auth, socket, auth_type, @@ -209,20 +151,14 @@ pub async fn libpq_listener_main( #[instrument(skip_all, fields(peer_addr))] async fn page_service_conn_main( tenant_manager: Arc, - broker_client: 
storage_broker::BrokerClientChannel, auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, connection_ctx: RequestContext, ) -> anyhow::Result<()> { - // Immediately increment the gauge, then create a job to decrement it on task exit. - // One of the pros of `defer!` is that this will *most probably* - // get called, even in presence of panics. - let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]); - gauge.inc(); - scopeguard::defer! { - gauge.dec(); - } + let _guard = LIVE_CONNECTIONS + .with_label_values(&["page_service"]) + .guard(); socket .set_nodelay(true) @@ -267,12 +203,11 @@ async fn page_service_conn_main( // and create a child per-query context when it invokes process_query. // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. - let mut conn_handler = - PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx); + let mut conn_handler = PageServerHandler::new(tenant_manager, auth, connection_ctx); let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; match pgbackend - .run(&mut conn_handler, task_mgr::shutdown_watcher) + .run(&mut conn_handler, &task_mgr::shutdown_token()) .await { Ok(()) => { @@ -299,7 +234,6 @@ struct HandlerTimeline { } struct PageServerHandler { - broker_client: storage_broker::BrokerClientChannel, auth: Option>, claims: Option, @@ -391,13 +325,11 @@ impl From for QueryError { impl PageServerHandler { pub fn new( tenant_manager: Arc, - broker_client: storage_broker::BrokerClientChannel, auth: Option>, connection_ctx: RequestContext, ) -> Self { PageServerHandler { tenant_manager, - broker_client, auth, claims: None, connection_ctx, @@ -480,73 +412,6 @@ impl PageServerHandler { ) } - fn copyin_stream<'a, IO>( - &'a self, - pgb: &'a mut PostgresBackend, - cancel: &'a CancellationToken, - ) -> impl Stream> + 'a - where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, - { - async_stream::try_stream! { - loop { - let msg = tokio::select! { - biased; - - _ = cancel.cancelled() => { - // We were requested to shut down. 
- let msg = "pageserver is shutting down"; - let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None)); - Err(QueryError::Shutdown) - } - - msg = pgb.read_message() => { msg.map_err(QueryError::from)} - }; - - match msg { - Ok(Some(message)) => { - let copy_data_bytes = match message { - FeMessage::CopyData(bytes) => bytes, - FeMessage::CopyDone => { break }, - FeMessage::Sync => continue, - FeMessage::Terminate => { - let msg = "client terminated connection with Terminate message during COPY"; - let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg))); - // error can't happen here, ErrorResponse serialization should be always ok - pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?; - Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; - break; - } - m => { - let msg = format!("unexpected message {m:?}"); - // error can't happen here, ErrorResponse serialization should be always ok - pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?; - Err(io::Error::new(io::ErrorKind::Other, msg))?; - break; - } - }; - - yield copy_data_bytes; - } - Ok(None) => { - let msg = "client closed connection during COPY"; - let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg))); - // error can't happen here, ErrorResponse serialization should be always ok - pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?; - self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?; - Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; - } - Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => { - Err(io_error)?; - } - Err(other) => { - Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?; - } - }; - } - } - } - #[instrument(skip_all)] async fn handle_pagerequests( &mut self, @@ -718,128 +583,6 @@ impl PageServerHandler { Ok(()) } - #[allow(clippy::too_many_arguments)] - #[instrument(skip_all, fields(%base_lsn, end_lsn=%_end_lsn, %pg_version))] - async fn handle_import_basebackup( - &self, - pgb: &mut PostgresBackend, - tenant_id: TenantId, - timeline_id: TimelineId, - base_lsn: Lsn, - _end_lsn: Lsn, - pg_version: u32, - ctx: RequestContext, - ) -> Result<(), QueryError> - where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, - { - debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); - - // Create empty timeline - info!("creating new timeline"); - let tenant = self - .get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT) - .await?; - let timeline = tenant - .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx) - .await?; - - // TODO mark timeline as not ready until it reaches end_lsn. - // We might have some wal to import as well, and we should prevent compute - // from connecting before that and writing conflicting wal. - // - // This is not relevant for pageserver->pageserver migrations, since there's - // no wal to import. But should be fixed if we want to import from postgres. - - // TODO leave clean state on error. For now you can use detach to clean - // up broken state from a failed import. 
- - // Import basebackup provided via CopyData - info!("importing basebackup"); - pgb.write_message_noflush(&BeMessage::CopyInResponse)?; - self.flush_cancellable(pgb, &tenant.cancel).await?; - - let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel))); - timeline - .import_basebackup_from_tar( - tenant.clone(), - &mut copyin_reader, - base_lsn, - self.broker_client.clone(), - &ctx, - ) - .await?; - - // Read the end of the tar archive. - read_tar_eof(copyin_reader).await?; - - // TODO check checksum - // Meanwhile you can verify client-side by taking fullbackup - // and checking that it matches in size with what was imported. - // It wouldn't work if base came from vanilla postgres though, - // since we discard some log files. - - info!("done"); - Ok(()) - } - - #[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))] - async fn handle_import_wal( - &self, - pgb: &mut PostgresBackend, - tenant_id: TenantId, - timeline_id: TimelineId, - start_lsn: Lsn, - end_lsn: Lsn, - ctx: RequestContext, - ) -> Result<(), QueryError> - where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, - { - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - let last_record_lsn = timeline.get_last_record_lsn(); - if last_record_lsn != start_lsn { - return Err(QueryError::Other( - anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")) - ); - } - - // TODO leave clean state on error. For now you can use detach to clean - // up broken state from a failed import. - - // Import wal provided via CopyData - info!("importing wal"); - pgb.write_message_noflush(&BeMessage::CopyInResponse)?; - self.flush_cancellable(pgb, &timeline.cancel).await?; - let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel))); - import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?; - info!("wal import complete"); - - // Read the end of the tar archive. - read_tar_eof(copyin_reader).await?; - - // TODO Does it make sense to overshoot? - if timeline.get_last_record_lsn() < end_lsn { - return Err(QueryError::Other( - anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")) - ); - } - - // Flush data to disk, then upload to s3. No need for a forced checkpoint. - // We only want to persist the data, and it doesn't matter if it's in the - // shape of deltas or images. - info!("flushing layers"); - timeline.freeze_and_flush().await.map_err(|e| match e { - FlushLayerError::Cancelled => QueryError::Shutdown, - other => QueryError::Other(other.into()), - })?; - - info!("done"); - Ok(()) - } - /// Helper function to handle the LSN from client request. 
/// /// Each GetPage (and Exists and Nblocks) request includes information about @@ -1656,53 +1399,6 @@ where metric_recording.observe(&res); res?; } - // return pair of prev_lsn and last_lsn - else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) { - if params.len() != 2 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for get_last_record_rlsn command" - ))); - } - - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::GetLastRecordRlsn) - .inc(); - - async { - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - - let end_of_timeline = timeline.get_last_record_rlsn(); - - pgb.write_message_noflush(&BeMessage::RowDescription(&[ - RowDescriptor::text_col(b"prev_lsn"), - RowDescriptor::text_col(b"last_lsn"), - ]))? - .write_message_noflush(&BeMessage::DataRow(&[ - Some(end_of_timeline.prev.to_string().as_bytes()), - Some(end_of_timeline.last.to_string().as_bytes()), - ]))? - .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - anyhow::Ok(()) - } - .instrument(info_span!( - "handle_get_last_record_lsn", - shard_id = tracing::field::Empty - )) - .await?; - } // same as basebackup, but result includes relational data as well else if let Some(params) = parts.strip_prefix(&["fullbackup"]) { if params.len() < 2 { @@ -1757,109 +1453,6 @@ where ) .await?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("import basebackup ") { - // Import the `base` section (everything but the wal) of a basebackup. - // Assumes the tenant already exists on this pageserver. - // - // Files are scheduled to be persisted to remote storage, and the - // caller should poll the http api to check when that is done. - // - // Example import command: - // 1. Get start/end LSN from backup_manifest file - // 2. 
Run: - // cat my_backup/base.tar | psql -h $PAGESERVER \ - // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION" - let params = &parts[2..]; - if params.len() != 5 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for import basebackup command" - ))); - } - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - let base_lsn = Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; - let end_lsn = Lsn::from_str(params[3]) - .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?; - let pg_version = u32::from_str(params[4]) - .with_context(|| format!("Failed to parse pg_version from {}", params[4]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::ImportBasebackup) - .inc(); - - match self - .handle_import_basebackup( - pgb, - tenant_id, - timeline_id, - base_lsn, - end_lsn, - pg_version, - ctx, - ) - .await - { - Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, - Err(e) => { - error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}"); - pgb.write_message_noflush(&BeMessage::ErrorResponse( - &e.to_string(), - Some(e.pg_error_code()), - ))? - } - }; - } else if query_string.starts_with("import wal ") { - // Import the `pg_wal` section of a basebackup. - // - // Files are scheduled to be persisted to remote storage, and the - // caller should poll the http api to check when that is done. - let params = &parts[2..]; - if params.len() != 4 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for import wal command" - ))); - } - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - let start_lsn = Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; - let end_lsn = Lsn::from_str(params[3]) - .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::ImportWal) - .inc(); - - match self - .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx) - .await - { - Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, - Err(e) => { - error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}"); - pgb.write_message_noflush(&BeMessage::ErrorResponse( - &e.to_string(), - Some(e.pg_error_code()), - ))? 
- } - }; } else if query_string.to_ascii_lowercase().starts_with("set ") { // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 25d00d6dfd0d..8a6cfea92b3b 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -854,13 +854,14 @@ impl Timeline { result.add_key(DBDIR_KEY); // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn, ctx).await?; - let dbdir = DbDirectory::des(&buf)?; + let dbdir = self.list_dbdirs(lsn, ctx).await?; + let mut dbs: Vec<((Oid, Oid), bool)> = dbdir.into_iter().collect(); - let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); - dbs.sort_unstable(); - for (spcnode, dbnode) in dbs { - result.add_key(relmap_file_key(spcnode, dbnode)); + dbs.sort_unstable_by(|(k_a, _), (k_b, _)| k_a.cmp(k_b)); + for ((spcnode, dbnode), has_relmap_file) in dbs { + if has_relmap_file { + result.add_key(relmap_file_key(spcnode, dbnode)); + } result.add_key(rel_dir_to_key(spcnode, dbnode)); let mut rels: Vec = self @@ -919,6 +920,9 @@ impl Timeline { result.add_key(AUX_FILES_KEY); } + // Add extra keyspaces in the test cases. Some test cases write keys into the storage without + // creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace` + // and the keys will not be garbage-colllected. #[cfg(test)] { let guard = self.extra_test_dense_keyspace.load(); @@ -927,13 +931,48 @@ impl Timeline { } } - Ok(( - result.to_keyspace(), - /* AUX sparse key space */ - SparseKeySpace(KeySpace { - ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()], - }), - )) + let dense_keyspace = result.to_keyspace(); + let sparse_keyspace = SparseKeySpace(KeySpace { + ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()], + }); + + if cfg!(debug_assertions) { + // Verify if the sparse keyspaces are ordered and non-overlapping. + + // We do not use KeySpaceAccum for sparse_keyspace because we want to ensure each + // category of sparse keys are split into their own image/delta files. If there + // are overlapping keyspaces, they will be automatically merged by keyspace accum, + // and we want the developer to keep the keyspaces separated. 
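A side note on the check that follows: the ranges are treated as half-open, so two ranges that merely touch at a boundary do not count as overlapping. A standalone illustration with integer ranges, kept generic for clarity (the in-tree helper operates on Key ranges):

use std::ops::Range;

// Half-open overlap test: x is in a iff a.start <= x < a.end, so touching
// ranges share no element and are not overlapping.
fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
    !(a.end <= b.start || b.end <= a.start)
}

fn main() {
    assert!(overlaps_with(&(0..10), &(5..15)));   // shared span 5..10
    assert!(!overlaps_with(&(0..10), &(10..20))); // adjacent, no overlap
    assert!(!overlaps_with(&(20..30), &(0..10))); // disjoint, order-independent
}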
+ + let ranges = &sparse_keyspace.0.ranges; + + // TODO: use a single overlaps_with across the codebase + fn overlaps_with(a: &Range, b: &Range) -> bool { + !(a.end <= b.start || b.end <= a.start) + } + for i in 0..ranges.len() { + for j in 0..i { + if overlaps_with(&ranges[i], &ranges[j]) { + panic!( + "overlapping sparse keyspace: {}..{} and {}..{}", + ranges[i].start, ranges[i].end, ranges[j].start, ranges[j].end + ); + } + } + } + for i in 1..ranges.len() { + assert!( + ranges[i - 1].end <= ranges[i].start, + "unordered sparse keyspace: {}..{} and {}..{}", + ranges[i - 1].start, + ranges[i - 1].end, + ranges[i].start, + ranges[i].end + ); + } + } + + Ok((dense_keyspace, sparse_keyspace)) } /// Get cached size of relation if it not updated after specified LSN diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 45e542a3367c..eef8dc104c69 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -73,6 +73,7 @@ use crate::deletion_queue::DeletionQueueClient; use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; use crate::is_uninit_mark; +use crate::l0_flush::L0FlushGlobalState; use crate::metrics::TENANT; use crate::metrics::{ remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, @@ -88,6 +89,7 @@ use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; +use crate::walredo; use crate::InitializationOrder; use std::collections::hash_map::Entry; use std::collections::BTreeSet; @@ -165,6 +167,7 @@ pub struct TenantSharedResources { pub broker_client: storage_broker::BrokerClientChannel, pub remote_storage: GenericRemoteStorage, pub deletion_queue_client: DeletionQueueClient, + pub l0_flush_global_state: L0FlushGlobalState, } /// A [`Tenant`] is really an _attached_ tenant. The configuration @@ -212,8 +215,6 @@ pub(crate) enum SpawnMode { Eager, /// Lazy activation in the background, with the option to skip the queue if the need comes up Lazy, - /// Tenant has been created during the lifetime of this process - Create, } /// @@ -295,6 +296,8 @@ pub struct Tenant { /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline. ongoing_timeline_detach: std::sync::Mutex>, + + l0_flush_global_state: L0FlushGlobalState, } impl std::fmt::Debug for Tenant { @@ -323,6 +326,16 @@ impl From for WalRedoManager { } impl WalRedoManager { + pub(crate) async fn shutdown(&self) { + match self { + Self::Prod(mgr) => mgr.shutdown().await, + #[cfg(test)] + Self::Test(_) => { + // Not applicable to test redo manager + } + } + } + pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) { match self { Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout), @@ -343,7 +356,7 @@ impl WalRedoManager { base_img: Option<(Lsn, bytes::Bytes)>, records: Vec<(Lsn, crate::walrecord::NeonWalRecord)>, pg_version: u32, - ) -> anyhow::Result { + ) -> Result { match self { Self::Prod(mgr) => { mgr.request_redo(key, lsn, base_img, records, pg_version) @@ -520,6 +533,15 @@ impl From for GcError { } } +#[derive(thiserror::Error, Debug)] +pub(crate) enum LoadConfigError { + #[error("TOML deserialization error: '{0}'")] + DeserializeToml(#[from] toml_edit::de::Error), + + #[error("Config not found at {0}")] + NotFound(Utf8PathBuf), +} + impl Tenant { /// Yet another helper for timeline initialization. 
/// @@ -658,6 +680,7 @@ impl Tenant { broker_client, remote_storage, deletion_queue_client, + l0_flush_global_state, } = resources; let attach_mode = attached_conf.location.attach_mode; @@ -672,6 +695,7 @@ impl Tenant { tenant_shard_id, remote_storage.clone(), deletion_queue_client, + l0_flush_global_state, )); // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if @@ -797,9 +821,6 @@ impl Tenant { }; let preload = match &mode { - SpawnMode::Create => { - None - }, SpawnMode::Eager | SpawnMode::Lazy => { let _preload_timer = TENANT.preload.start_timer(); let res = tenant_clone @@ -821,11 +842,8 @@ impl Tenant { // We will time the duration of the attach phase unless this is a creation (attach will do no work) let attached = { - let _attach_timer = match mode { - SpawnMode::Create => None, - SpawnMode::Eager | SpawnMode::Lazy => Some(TENANT.attach.start_timer()), - }; - tenant_clone.attach(preload, mode, &ctx).await + let _attach_timer = Some(TENANT.attach.start_timer()); + tenant_clone.attach(preload, &ctx).await }; match attached { @@ -901,21 +919,14 @@ impl Tenant { async fn attach( self: &Arc, preload: Option, - mode: SpawnMode, ctx: &RequestContext, ) -> anyhow::Result<()> { span::debug_assert_current_span_has_tenant_id(); failpoint_support::sleep_millis_async!("before-attaching-tenant"); - let preload = match (preload, mode) { - (Some(p), _) => p, - (None, SpawnMode::Create) => TenantPreload { - timelines: HashMap::new(), - }, - (None, _) => { - anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"); - } + let Some(preload) = preload else { + anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"); }; let mut timelines_to_resume_deletions = vec![]; @@ -984,6 +995,7 @@ impl Tenant { TimelineResources { remote_client, timeline_get_throttle: self.timeline_get_throttle.clone(), + l0_flush_global_state: self.l0_flush_global_state.clone(), }, ctx, ) @@ -1353,7 +1365,7 @@ impl Tenant { initdb_lsn: Lsn, pg_version: u32, ctx: &RequestContext, - delta_layer_desc: Vec>, + delta_layer_desc: Vec, image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, end_lsn: Lsn, ) -> anyhow::Result> { @@ -1804,9 +1816,15 @@ impl Tenant { // If we're still attaching, fire the cancellation token early to drop out: this // will prevent us flushing, but ensures timely shutdown if some I/O during attach // is very slow. 
- if matches!(self.current_state(), TenantState::Attaching) { + let shutdown_mode = if matches!(self.current_state(), TenantState::Attaching) { self.cancel.cancel(); - } + + // Having fired our cancellation token, do not try and flush timelines: their cancellation tokens + // are children of ours, so their flush loops will have shut down already + timeline::ShutdownMode::Hard + } else { + shutdown_mode + }; match self.set_stopping(shutdown_progress, false, false).await { Ok(()) => {} @@ -1853,6 +1871,10 @@ impl Tenant { tracing::debug!("Waiting for tasks..."); task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), None).await; + if let Some(walredo_mgr) = self.walredo_mgr.as_ref() { + walredo_mgr.shutdown().await; + } + // Wait for any in-flight operations to complete self.gate.close().await; @@ -2469,6 +2491,7 @@ impl Tenant { tenant_shard_id: TenantShardId, remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, + l0_flush_global_state: L0FlushGlobalState, ) -> Tenant { debug_assert!( !attached_conf.location.generation.is_none() || conf.control_plane_api.is_none() @@ -2556,6 +2579,7 @@ impl Tenant { )), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), ongoing_timeline_detach: std::sync::Mutex::default(), + l0_flush_global_state, } } @@ -2563,36 +2587,35 @@ impl Tenant { pub(super) fn load_tenant_config( conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, - ) -> anyhow::Result { + ) -> Result { let config_path = conf.tenant_location_config_path(tenant_shard_id); - if config_path.exists() { - // New-style config takes precedence - let deserialized = Self::read_config(&config_path)?; - Ok(toml_edit::de::from_document::(deserialized)?) - } else { - // The config should almost always exist for a tenant directory: - // - When attaching a tenant, the config is the first thing we write - // - When detaching a tenant, we atomically move the directory to a tmp location - // before deleting contents. - // - // The very rare edge case that can result in a missing config is if we crash during attach - // between creating directory and writing config. Callers should handle that as if the - // directory didn't exist. - anyhow::bail!("tenant config not found in {}", config_path); - } - } - - fn read_config(path: &Utf8Path) -> anyhow::Result { - info!("loading tenant configuration from {path}"); + info!("loading tenant configuration from {config_path}"); // load and parse file - let config = fs::read_to_string(path) - .with_context(|| format!("Failed to load config from path '{path}'"))?; + let config = fs::read_to_string(&config_path).map_err(|e| { + match e.kind() { + std::io::ErrorKind::NotFound => { + // The config should almost always exist for a tenant directory: + // - When attaching a tenant, the config is the first thing we write + // - When detaching a tenant, we atomically move the directory to a tmp location + // before deleting contents. + // + // The very rare edge case that can result in a missing config is if we crash during attach + // between creating directory and writing config. Callers should handle that as if the + // directory didn't exist. 
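An aside on the caller side of this split error type: a caller that wants to treat a missing config as "tenant directory absent" while surfacing parse failures might branch on the two variants roughly as below. The enum here is a local stand-in mirroring the variants introduced above (the real one uses Utf8PathBuf and toml_edit's error type), and the calling function is purely hypothetical.

use std::path::PathBuf;

// Local stand-ins for illustration only.
#[derive(Debug)]
enum LoadConfigError {
    DeserializeToml(String),
    NotFound(PathBuf),
}

// Hypothetical caller: a missing config means "no tenant here", while a
// malformed config is an error worth reporting loudly.
fn describe(load_result: Result<(), LoadConfigError>) -> String {
    match load_result {
        Ok(()) => "config loaded".to_string(),
        Err(LoadConfigError::NotFound(path)) => {
            format!("no tenant config at {}, treating directory as absent", path.display())
        }
        Err(LoadConfigError::DeserializeToml(e)) => {
            format!("tenant config is corrupt: {e}")
        }
    }
}

fn main() {
    println!("{}", describe(Err(LoadConfigError::NotFound(PathBuf::from("/tmp/t/config-v1")))));
    println!("{}", describe(Err(LoadConfigError::DeserializeToml("unexpected key".to_string()))));
}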
+ + LoadConfigError::NotFound(config_path) + } + _ => { + // No IO errors except NotFound are acceptable here: other kinds of error indicate local storage or permissions issues + // that we cannot cleanly recover + crate::virtual_file::on_fatal_io_error(&e, "Reading tenant config file") + } + } + })?; - config - .parse::() - .with_context(|| format!("Failed to parse config from file '{path}' as toml file")) + Ok(toml_edit::de::from_str::(&config)?) } #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] @@ -2600,7 +2623,7 @@ impl Tenant { conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, location_conf: &LocationConf, - ) -> anyhow::Result<()> { + ) -> std::io::Result<()> { let config_path = conf.tenant_location_config_path(tenant_shard_id); Self::persist_tenant_config_at(tenant_shard_id, &config_path, location_conf).await @@ -2611,7 +2634,7 @@ impl Tenant { tenant_shard_id: &TenantShardId, config_path: &Utf8Path, location_conf: &LocationConf, - ) -> anyhow::Result<()> { + ) -> std::io::Result<()> { debug!("persisting tenantconf to {config_path}"); let mut conf_content = r#"# This file contains a specific per-tenant's config. @@ -2620,22 +2643,20 @@ impl Tenant { .to_string(); fail::fail_point!("tenant-config-before-write", |_| { - anyhow::bail!("tenant-config-before-write"); + Err(std::io::Error::new( + std::io::ErrorKind::Other, + "tenant-config-before-write", + )) }); // Convert the config to a toml file. - conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?; + conf_content += + &toml_edit::ser::to_string_pretty(&location_conf).expect("Config serialization failed"); let temp_path = path_with_suffix_extension(config_path, TEMP_FILE_SUFFIX); - let tenant_shard_id = *tenant_shard_id; - let config_path = config_path.to_owned(); let conf_content = conf_content.into_bytes(); - VirtualFile::crashsafe_overwrite(config_path.clone(), temp_path, conf_content) - .await - .with_context(|| format!("write tenant {tenant_shard_id} config to {config_path}"))?; - - Ok(()) + VirtualFile::crashsafe_overwrite(config_path.to_owned(), temp_path, conf_content).await } // @@ -2853,6 +2874,7 @@ impl Tenant { { let mut target = timeline.gc_info.write().unwrap(); + // Cull any expired leases let now = SystemTime::now(); target.leases.retain(|_, lease| !lease.is_expired(&now)); @@ -2861,6 +2883,31 @@ impl Tenant { .valid_lsn_lease_count_gauge .set(target.leases.len() as u64); + // Look up parent's PITR cutoff to update the child's knowledge of whether it is within parent's PITR + if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() { + if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) { + target.within_ancestor_pitr = + timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.pitr; + } + } + + // Update metrics that depend on GC state + timeline + .metrics + .archival_size + .set(if target.within_ancestor_pitr { + timeline.metrics.current_logical_size_gauge.get() + } else { + 0 + }); + timeline.metrics.pitr_history_size.set( + timeline + .get_last_record_lsn() + .checked_sub(target.cutoffs.pitr) + .unwrap_or(Lsn(0)) + .0, + ); + match gc_cutoffs.remove(&timeline.timeline_id) { Some(cutoffs) => { target.retain_lsns = branchpoints; @@ -2912,7 +2959,7 @@ impl Tenant { dst_id: TimelineId, ancestor_lsn: Option, ctx: &RequestContext, - delta_layer_desc: Vec>, + delta_layer_desc: Vec, image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, end_lsn: Lsn, ) -> anyhow::Result> { @@ -3296,6 
+3343,7 @@ impl Tenant { TimelineResources { remote_client, timeline_get_throttle: self.timeline_get_throttle.clone(), + l0_flush_global_state: self.l0_flush_global_state.clone(), } } @@ -3632,6 +3680,7 @@ pub(crate) mod harness { use utils::logging; use crate::deletion_queue::mock::MockDeletionQueue; + use crate::l0_flush::L0FlushConfig; use crate::walredo::apply_neon; use crate::{repository::Key, walrecord::NeonWalRecord}; @@ -3821,12 +3870,14 @@ pub(crate) mod harness { self.tenant_shard_id, self.remote_storage.clone(), self.deletion_queue.new_client(), + // TODO: ideally we should run all unit tests with both configs + L0FlushGlobalState::new(L0FlushConfig::default()), )); let preload = tenant .preload(&self.remote_storage, CancellationToken::new()) .await?; - tenant.attach(Some(preload), SpawnMode::Eager, ctx).await?; + tenant.attach(Some(preload), ctx).await?; tenant.state.send_replace(TenantState::Active); for timeline in tenant.timelines.lock().unwrap().values() { @@ -3854,7 +3905,7 @@ pub(crate) mod harness { base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, _pg_version: u32, - ) -> anyhow::Result { + ) -> Result { let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1)); if records_neon { // For Neon wal records, we can decode without spawning postgres, so do so. @@ -3908,7 +3959,7 @@ mod tests { use storage_layer::PersistentLayerKey; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; - use timeline::GcInfo; + use timeline::{DeltaLayerTestDesc, GcInfo}; use utils::bin_ser::BeSer; use utils::id::TenantId; @@ -6204,27 +6255,6 @@ mod tests { .await .unwrap(); - async fn get_vectored_impl_wrapper( - tline: &Arc, - key: Key, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result, GetVectoredError> { - let mut reconstruct_state = ValuesReconstructState::new(); - let mut res = tline - .get_vectored_impl( - KeySpace::single(key..key.next()), - lsn, - &mut reconstruct_state, - ctx, - ) - .await?; - Ok(res.pop_last().map(|(k, v)| { - assert_eq!(k, key); - v.unwrap() - })) - } - let lsn = Lsn(0x30); // test vectored get on parent timeline @@ -6264,7 +6294,7 @@ mod tests { #[tokio::test] async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?; + let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?; let (tenant, ctx) = harness.load().await; let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); @@ -6300,27 +6330,6 @@ mod tests { .await .unwrap(); - async fn get_vectored_impl_wrapper( - tline: &Arc, - key: Key, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result, GetVectoredError> { - let mut reconstruct_state = ValuesReconstructState::new(); - let mut res = tline - .get_vectored_impl( - KeySpace::single(key..key.next()), - lsn, - &mut reconstruct_state, - ctx, - ) - .await?; - Ok(res.pop_last().map(|(k, v)| { - assert_eq!(k, key); - v.unwrap() - })) - } - let lsn = Lsn(0x30); // test vectored get on parent timeline @@ -6396,9 +6405,18 @@ mod tests { &ctx, // delta layers vec![ - vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], - vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], - vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + 
Lsn(0x20)..Lsn(0x30), + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ), ], // image layers vec![ @@ -6464,17 +6482,29 @@ mod tests { &ctx, // delta layers vec![ - vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], - vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], - vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], - vec![ - (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))), - (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))), - ], + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x30)..Lsn(0x40), + vec![ + (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))), + (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))), + ], + ), ], // image layers vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], - Lsn(0x30), + Lsn(0x40), ) .await .unwrap(); @@ -6497,7 +6527,7 @@ mod tests { // Image layers are created at last_record_lsn let images = tline - .inspect_image_layers(Lsn(0x30), &ctx) + .inspect_image_layers(Lsn(0x40), &ctx) .await .unwrap() .into_iter() @@ -6523,9 +6553,18 @@ mod tests { &ctx, // delta layers vec![ - vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], - vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], - vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ), ], // image layers vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], @@ -6573,15 +6612,21 @@ mod tests { key } - // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon. + // We create + // - one bottom-most image layer, + // - a delta layer D1 crossing the GC horizon with data below and above the horizon, + // - a delta layer D2 crossing the GC horizon with data only below the horizon, + // - a delta layer D3 above the horizon. 
// - // | D1 | | D3 | + // | D3 | + // | D1 | // -| |-- gc horizon ----------------- // | | | D2 | // --------- img layer ------------------ // // What we should expact from this compaction is: - // | Part of D1 | | D3 | + // | D3 | + // | Part of D1 | // --------- img layer with D1+D2 at GC horizon------------------ // img layer at 0x10 @@ -6621,13 +6666,13 @@ mod tests { let delta3 = vec![ ( get_key(8), - Lsn(0x40), - Value::Image(Bytes::from("value 8@0x40")), + Lsn(0x48), + Value::Image(Bytes::from("value 8@0x48")), ), ( get_key(9), - Lsn(0x40), - Value::Image(Bytes::from("value 9@0x40")), + Lsn(0x48), + Value::Image(Bytes::from("value 9@0x48")), ), ]; @@ -6637,7 +6682,11 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, - vec![delta1, delta2, delta3], // delta layers + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers vec![(Lsn(0x10), img_layer)], // image layers Lsn(0x50), ) @@ -6658,8 +6707,8 @@ mod tests { Bytes::from_static(b"value 5@0x20"), Bytes::from_static(b"value 6@0x20"), Bytes::from_static(b"value 7@0x10"), - Bytes::from_static(b"value 8@0x40"), - Bytes::from_static(b"value 9@0x40"), + Bytes::from_static(b"value 8@0x48"), + Bytes::from_static(b"value 9@0x48"), ]; for (idx, expected) in expected_result.iter().enumerate() { @@ -6747,10 +6796,10 @@ mod tests { lsn_range: Lsn(0x30)..Lsn(0x41), is_delta: true }, - // The delta layer we created and should not be picked for the compaction + // The delta3 layer that should not be picked for the compaction PersistentLayerKey { key_range: get_key(8)..get_key(10), - lsn_range: Lsn(0x40)..Lsn(0x41), + lsn_range: Lsn(0x48)..Lsn(0x50), is_delta: true } ] @@ -6814,7 +6863,10 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, - vec![delta1], // delta layers + vec![DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x40), + delta1, + )], // delta layers vec![(Lsn(0x10), image1)], // image layers Lsn(0x50), ) @@ -6938,15 +6990,21 @@ mod tests { key } - // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon. + // We create + // - one bottom-most image layer, + // - a delta layer D1 crossing the GC horizon with data below and above the horizon, + // - a delta layer D2 crossing the GC horizon with data only below the horizon, + // - a delta layer D3 above the horizon. 
// - // | D1 | | D3 | + // | D3 | + // | D1 | // -| |-- gc horizon ----------------- // | | | D2 | // --------- img layer ------------------ // // What we should expact from this compaction is: - // | Part of D1 | | D3 | + // | D3 | + // | Part of D1 | // --------- img layer with D1+D2 at GC horizon------------------ // img layer at 0x10 @@ -6996,13 +7054,13 @@ mod tests { let delta3 = vec![ ( get_key(8), - Lsn(0x40), - Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), ), ( get_key(9), - Lsn(0x40), - Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), ), ]; @@ -7012,7 +7070,11 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, - vec![delta1, delta2, delta3], // delta layers + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers vec![(Lsn(0x10), img_layer)], // image layers Lsn(0x50), ) @@ -7027,6 +7089,7 @@ mod tests { horizon: Lsn(0x30), }, leases: Default::default(), + within_ancestor_pitr: false, }; } @@ -7039,8 +7102,8 @@ mod tests { Bytes::from_static(b"value 5@0x10@0x20"), Bytes::from_static(b"value 6@0x10@0x20"), Bytes::from_static(b"value 7@0x10"), - Bytes::from_static(b"value 8@0x10@0x40"), - Bytes::from_static(b"value 9@0x10@0x40"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), ]; let expected_result_at_gc_horizon = [ diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 2be8816cefbd..e98ed66ef998 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -6,13 +6,20 @@ //! is written as a one byte. If it's larger than that, the length //! is written as a four-byte integer, in big-endian, with the high //! bit set. This way, we can detect whether it's 1- or 4-byte header -//! by peeking at the first byte. +//! by peeking at the first byte. For blobs larger than 128 bits, +//! we also specify three reserved bits, only one of the three bit +//! patterns is currently in use (0b011) and signifies compression +//! with zstd. //! //! len < 128: 0XXXXXXX -//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX +//! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! 
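The module comment above describes the on-disk blob header: a 1-byte length for short blobs, and a 4-byte big-endian length whose top nibble carries the high bit plus the reserved compression bits. Below is a minimal standalone sketch of that scheme, reusing the constant values this patch adds later in blob_io.rs (0x80 uncompressed, 0x90 zstd, 0xf0 mask, 0x0fff_ffff max length); the `encode_blob`/`decode_blob` names and the use of the synchronous `zstd` crate instead of `async_compression` are illustrative assumptions, not part of the patch.

```rust
// Standalone sketch of the length-header scheme described in the module comment.
// Constants mirror the ones this patch adds to blob_io.rs.
const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;
const BYTE_UNCOMPRESSED: u8 = 0x80;
const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;

/// Encode one blob: short blobs get a 1-byte header, long blobs a 4-byte
/// header whose top nibble carries the compression tag. Compression is
/// only kept if it actually makes the payload smaller.
fn encode_blob(payload: &[u8]) -> std::io::Result<Vec<u8>> {
    let mut out = Vec::new();
    if payload.len() < 128 {
        out.push(payload.len() as u8);
        out.extend_from_slice(payload);
        return Ok(out);
    }
    assert!(payload.len() <= MAX_SUPPORTED_LEN, "blob too large");
    let compressed = zstd::encode_all(payload, 1)?; // level 1, as in the tests below
    let (tag, body) = if compressed.len() < payload.len() {
        (BYTE_ZSTD, compressed.as_slice())
    } else {
        (BYTE_UNCOMPRESSED, payload)
    };
    let mut len_buf = (body.len() as u32).to_be_bytes();
    assert_eq!(len_buf[0] & LEN_COMPRESSION_BIT_MASK, 0);
    len_buf[0] |= tag;
    out.extend_from_slice(&len_buf);
    out.extend_from_slice(body);
    Ok(out)
}

/// Decode the header written by `encode_blob` and return the original bytes.
fn decode_blob(buf: &[u8]) -> std::io::Result<Vec<u8>> {
    let first = buf[0];
    if first < 0x80 {
        // 1-byte header: the byte itself is the length.
        return Ok(buf[1..1 + first as usize].to_vec());
    }
    let mut len_buf = [0u8; 4];
    len_buf.copy_from_slice(&buf[..4]);
    len_buf[0] &= !LEN_COMPRESSION_BIT_MASK; // strip the tag bits
    let len = u32::from_be_bytes(len_buf) as usize;
    let body = &buf[4..4 + len];
    match first & LEN_COMPRESSION_BIT_MASK {
        BYTE_UNCOMPRESSED => Ok(body.to_vec()),
        BYTE_ZSTD => zstd::decode_all(body),
        other => Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            format!("invalid compression byte {other:x}"),
        )),
    }
}

fn main() -> std::io::Result<()> {
    let blob = vec![0xf3u8; 24 * 8192]; // highly compressible, like the test data below
    let encoded = encode_blob(&blob)?;
    assert!(encoded.len() < blob.len());
    assert_eq!(decode_blob(&encoded)?, blob);
    Ok(())
}
```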
+use async_compression::Level; use bytes::{BufMut, BytesMut}; +use pageserver_api::models::ImageCompressionAlgorithm; +use tokio::io::AsyncWriteExt; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use tracing::warn; use crate::context::RequestContext; use crate::page_cache::PAGE_SZ; @@ -66,12 +73,37 @@ impl<'a> BlockCursor<'a> { len_buf.copy_from_slice(&buf[off..off + 4]); off += 4; } - len_buf[0] &= 0x7f; + let bit_mask = if self.read_compressed { + !LEN_COMPRESSION_BIT_MASK + } else { + 0x7f + }; + len_buf[0] &= bit_mask; u32::from_be_bytes(len_buf) as usize }; + let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK; - dstbuf.clear(); - dstbuf.reserve(len); + let mut tmp_buf = Vec::new(); + let buf_to_write; + let compression = if compression_bits <= BYTE_UNCOMPRESSED || !self.read_compressed { + if compression_bits > BYTE_UNCOMPRESSED { + warn!("reading key above future limit ({len} bytes)"); + } + buf_to_write = dstbuf; + None + } else if compression_bits == BYTE_ZSTD { + buf_to_write = &mut tmp_buf; + Some(dstbuf) + } else { + let error = std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("invalid compression byte {compression_bits:x}"), + ); + return Err(error); + }; + + buf_to_write.clear(); + buf_to_write.reserve(len); // Read the payload let mut remain = len; @@ -85,14 +117,35 @@ impl<'a> BlockCursor<'a> { page_remain = PAGE_SZ; } let this_blk_len = min(remain, page_remain); - dstbuf.extend_from_slice(&buf[off..off + this_blk_len]); + buf_to_write.extend_from_slice(&buf[off..off + this_blk_len]); remain -= this_blk_len; off += this_blk_len; } + + if let Some(dstbuf) = compression { + if compression_bits == BYTE_ZSTD { + let mut decoder = async_compression::tokio::write::ZstdDecoder::new(dstbuf); + decoder.write_all(buf_to_write).await?; + decoder.flush().await?; + } else { + unreachable!("already checked above") + } + } + Ok(()) } } +/// Reserved bits for length and compression +const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0; + +/// The maximum size of blobs we support. The highest few bits +/// are reserved for compression and other further uses. +const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff; + +const BYTE_UNCOMPRESSED: u8 = 0x80; +const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10; + /// A wrapper of `VirtualFile` that allows users to write blobs. /// /// If a `BlobWriter` is dropped, the internal buffer will be @@ -219,6 +272,18 @@ impl BlobWriter { &mut self, srcbuf: B, ctx: &RequestContext, + ) -> (B::Buf, Result) { + self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled) + .await + } + + /// Write a blob of data. Returns the offset that it was written to, + /// which can be used to retrieve the data later. + pub async fn write_blob_maybe_compressed, Buf: IoBuf + Send>( + &mut self, + srcbuf: B, + ctx: &RequestContext, + algorithm: ImageCompressionAlgorithm, ) -> (B::Buf, Result) { let offset = self.offset; @@ -226,29 +291,60 @@ impl BlobWriter { let mut io_buf = self.io_buf.take().expect("we always put it back below"); io_buf.clear(); - let (io_buf, hdr_res) = async { + let mut compressed_buf = None; + let ((io_buf, hdr_res), srcbuf) = async { if len < 128 { // Short blob. 
Write a 1-byte length header io_buf.put_u8(len as u8); - self.write_all(io_buf, ctx).await + ( + self.write_all(io_buf, ctx).await, + srcbuf.slice_full().into_inner(), + ) } else { // Write a 4-byte length header - if len > 0x7fff_ffff { + if len > MAX_SUPPORTED_LEN { return ( - io_buf, - Err(Error::new( - ErrorKind::Other, - format!("blob too large ({len} bytes)"), - )), + ( + io_buf, + Err(Error::new( + ErrorKind::Other, + format!("blob too large ({len} bytes)"), + )), + ), + srcbuf.slice_full().into_inner(), ); } - if len > 0x0fff_ffff { - tracing::warn!("writing blob above future limit ({len} bytes)"); - } - let mut len_buf = (len as u32).to_be_bytes(); - len_buf[0] |= 0x80; + let (high_bit_mask, len_written, srcbuf) = match algorithm { + ImageCompressionAlgorithm::Zstd { level } => { + let mut encoder = if let Some(level) = level { + async_compression::tokio::write::ZstdEncoder::with_quality( + Vec::new(), + Level::Precise(level.into()), + ) + } else { + async_compression::tokio::write::ZstdEncoder::new(Vec::new()) + }; + let slice = srcbuf.slice_full(); + encoder.write_all(&slice[..]).await.unwrap(); + encoder.shutdown().await.unwrap(); + let compressed = encoder.into_inner(); + if compressed.len() < len { + let compressed_len = compressed.len(); + compressed_buf = Some(compressed); + (BYTE_ZSTD, compressed_len, slice.into_inner()) + } else { + (BYTE_UNCOMPRESSED, len, slice.into_inner()) + } + } + ImageCompressionAlgorithm::Disabled => { + (BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner()) + } + }; + let mut len_buf = (len_written as u32).to_be_bytes(); + assert_eq!(len_buf[0] & 0xf0, 0); + len_buf[0] |= high_bit_mask; io_buf.extend_from_slice(&len_buf[..]); - self.write_all(io_buf, ctx).await + (self.write_all(io_buf, ctx).await, srcbuf) } } .await; @@ -257,7 +353,12 @@ impl BlobWriter { Ok(_) => (), Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)), } - let (srcbuf, res) = self.write_all(srcbuf, ctx).await; + let (srcbuf, res) = if let Some(compressed_buf) = compressed_buf { + let (_buf, res) = self.write_all(compressed_buf, ctx).await; + (Slice::into_inner(srcbuf.slice(..)), res) + } else { + self.write_all(srcbuf, ctx).await + }; (srcbuf, res.map(|_| offset)) } } @@ -295,6 +396,13 @@ mod tests { use rand::{Rng, SeedableRng}; async fn round_trip_test(blobs: &[Vec]) -> Result<(), Error> { + round_trip_test_compressed::(blobs, false).await + } + + async fn round_trip_test_compressed( + blobs: &[Vec], + compression: bool, + ) -> Result<(), Error> { let temp_dir = camino_tempfile::tempdir()?; let pathbuf = temp_dir.path().join("file"); let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); @@ -305,7 +413,16 @@ mod tests { let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { - let (_, res) = wtr.write_blob(blob.clone(), &ctx).await; + let (_, res) = if compression { + wtr.write_blob_maybe_compressed( + blob.clone(), + &ctx, + ImageCompressionAlgorithm::Zstd { level: Some(1) }, + ) + .await + } else { + wtr.write_blob(blob.clone(), &ctx).await + }; let offs = res?; offsets.push(offs); } @@ -319,7 +436,7 @@ mod tests { let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?; let rdr = BlockReaderRef::VirtualFile(&file); - let rdr = BlockCursor::new(rdr); + let rdr = BlockCursor::new_with_compression(rdr, compression); for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { let blob_read = rdr.read_blob(*offset, &ctx).await?; assert_eq!( @@ 
-353,6 +470,8 @@ mod tests { ]; round_trip_test::(blobs).await?; round_trip_test::(blobs).await?; + round_trip_test_compressed::(blobs, true).await?; + round_trip_test_compressed::(blobs, true).await?; Ok(()) } @@ -361,10 +480,15 @@ mod tests { let blobs = &[ b"test".to_vec(), random_array(10 * PAGE_SZ), + b"hello".to_vec(), + random_array(66 * PAGE_SZ), + vec![0xf3; 24 * PAGE_SZ], b"foobar".to_vec(), ]; round_trip_test::(blobs).await?; round_trip_test::(blobs).await?; + round_trip_test_compressed::(blobs, true).await?; + round_trip_test_compressed::(blobs, true).await?; Ok(()) } diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index b406d5033243..601b09515519 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -37,6 +37,7 @@ where pub enum BlockLease<'a> { PageReadGuard(PageReadGuard<'static>), EphemeralFileMutableTail(&'a [u8; PAGE_SZ]), + Slice(&'a [u8; PAGE_SZ]), #[cfg(test)] Arc(std::sync::Arc<[u8; PAGE_SZ]>), #[cfg(test)] @@ -63,6 +64,7 @@ impl<'a> Deref for BlockLease<'a> { match self { BlockLease::PageReadGuard(v) => v.deref(), BlockLease::EphemeralFileMutableTail(v) => v, + BlockLease::Slice(v) => v, #[cfg(test)] BlockLease::Arc(v) => v.deref(), #[cfg(test)] @@ -81,6 +83,7 @@ pub(crate) enum BlockReaderRef<'a> { FileBlockReader(&'a FileBlockReader<'a>), EphemeralFile(&'a EphemeralFile), Adapter(Adapter<&'a DeltaLayerInner>), + Slice(&'a [u8]), #[cfg(test)] TestDisk(&'a super::disk_btree::tests::TestDisk), #[cfg(test)] @@ -99,6 +102,7 @@ impl<'a> BlockReaderRef<'a> { FileBlockReader(r) => r.read_blk(blknum, ctx).await, EphemeralFile(r) => r.read_blk(blknum, ctx).await, Adapter(r) => r.read_blk(blknum, ctx).await, + Slice(s) => Self::read_blk_slice(s, blknum), #[cfg(test)] TestDisk(r) => r.read_blk(blknum), #[cfg(test)] @@ -107,6 +111,24 @@ impl<'a> BlockReaderRef<'a> { } } +impl<'a> BlockReaderRef<'a> { + fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result { + let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap(); + let end = start.checked_add(PAGE_SZ).unwrap(); + if end > slice.len() { + return Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + format!("slice too short, len={} end={}", slice.len(), end), + )); + } + let slice = &slice[start..end]; + let page_sized: &[u8; PAGE_SZ] = slice + .try_into() + .expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ"); + Ok(BlockLease::Slice(page_sized)) + } +} + /// /// A "cursor" for efficiently reading multiple pages from a BlockReader /// @@ -127,16 +149,24 @@ impl<'a> BlockReaderRef<'a> { /// ``` /// pub struct BlockCursor<'a> { + pub(super) read_compressed: bool, reader: BlockReaderRef<'a>, } impl<'a> BlockCursor<'a> { pub(crate) fn new(reader: BlockReaderRef<'a>) -> Self { - BlockCursor { reader } + Self::new_with_compression(reader, false) + } + pub(crate) fn new_with_compression(reader: BlockReaderRef<'a>, read_compressed: bool) -> Self { + BlockCursor { + read_compressed, + reader, + } } // Needed by cli pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self { BlockCursor { + read_compressed: false, reader: BlockReaderRef::FileBlockReader(reader), } } @@ -166,11 +196,17 @@ pub struct FileBlockReader<'a> { /// Unique ID of this file, used as key in the page cache. 
file_id: page_cache::FileId, + + compressed_reads: bool, } impl<'a> FileBlockReader<'a> { pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self { - FileBlockReader { file_id, file } + FileBlockReader { + file_id, + file, + compressed_reads: true, + } } /// Read a page from the underlying file into given buffer. @@ -217,7 +253,10 @@ impl<'a> FileBlockReader<'a> { impl BlockReader for FileBlockReader<'_> { fn block_cursor(&self) -> BlockCursor<'_> { - BlockCursor::new(BlockReaderRef::FileBlockReader(self)) + BlockCursor::new_with_compression( + BlockReaderRef::FileBlockReader(self), + self.compressed_reads, + ) } } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 79cc7bf15373..bb65ae24fc5e 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -21,6 +21,7 @@ pub struct EphemeralFile { } mod page_caching; +pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite; mod zero_padded_read_write; impl EphemeralFile { @@ -53,7 +54,7 @@ impl EphemeralFile { Ok(EphemeralFile { _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - rw: page_caching::RW::new(file), + rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()), }) } @@ -65,6 +66,11 @@ impl EphemeralFile { self.rw.page_cache_file_id() } + /// See [`self::page_caching::RW::load_to_vec`]. + pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { + self.rw.load_to_vec(ctx).await + } + pub(crate) async fn read_blk( &self, blknum: u32, diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 276ac8706493..43b9fff28d98 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -8,6 +8,7 @@ use crate::virtual_file::VirtualFile; use once_cell::sync::Lazy; use std::io::{self, ErrorKind}; +use std::ops::{Deref, Range}; use tokio_epoll_uring::BoundedBuf; use tracing::*; @@ -19,14 +20,23 @@ pub struct RW { rw: super::zero_padded_read_write::RW, } +/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`], +/// should we pre-warm the [`crate::page_cache`] with the contents? +#[derive(Clone, Copy)] +pub enum PrewarmOnWrite { + Yes, + No, +} + impl RW { - pub fn new(file: VirtualFile) -> Self { + pub fn new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self { let page_cache_file_id = page_cache::next_file_id(); Self { page_cache_file_id, rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new( page_cache_file_id, file, + prewarm_on_write, )), } } @@ -49,6 +59,43 @@ impl RW { self.rw.bytes_written() } + /// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer. + /// + /// This includes the blocks that aren't yet flushed to disk by the internal buffered writer. + /// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`]. 
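`load_to_vec` promises a buffer that is always a whole number of `PAGE_SZ` pages, which is what lets the new `BlockReaderRef::Slice` variant in block_io.rs serve `read_blk` straight out of memory. A small standalone sketch of that pairing, assuming `PAGE_SZ` is 8192 as in the page cache; the helper names are illustrative.

```rust
const PAGE_SZ: usize = 8192; // assumed to match the pageserver's page cache page size

/// Zero-pad `data` so its length is a whole number of PAGE_SZ pages,
/// mirroring what load_to_vec promises for the buffer it returns.
fn pad_to_page_multiple(mut data: Vec<u8>) -> Vec<u8> {
    let rem = data.len() % PAGE_SZ;
    if rem != 0 {
        data.resize(data.len() + (PAGE_SZ - rem), 0);
    }
    data
}

/// Serve one page out of a contiguous buffer, the way the new
/// BlockReaderRef::Slice variant does, with an explicit bounds check.
fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result<&[u8; PAGE_SZ]> {
    let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap();
    let end = start.checked_add(PAGE_SZ).unwrap();
    if end > slice.len() {
        return Err(std::io::Error::new(
            std::io::ErrorKind::UnexpectedEof,
            format!("slice too short, len={} end={}", slice.len(), end),
        ));
    }
    Ok(slice[start..end].try_into().expect("slice is PAGE_SZ long"))
}

fn main() {
    let buf = pad_to_page_multiple(vec![1u8; PAGE_SZ + 100]); // a bit over one page of data
    assert_eq!(buf.len(), 2 * PAGE_SZ);
    assert_eq!(read_blk_slice(&buf, 1).unwrap()[100], 0); // tail is zero-padded
    assert!(read_blk_slice(&buf, 2).is_err()); // out of bounds
}
```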
+ pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { + // round up to the next PAGE_SZ multiple, required by blob_io + let size = { + let s = usize::try_from(self.bytes_written()).unwrap(); + if s % PAGE_SZ == 0 { + s + } else { + s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap() + } + }; + let vec = Vec::with_capacity(size); + + // read from disk what we've already flushed + let writer = self.rw.as_writer(); + let flushed_range = writer.written_range(); + let mut vec = writer + .file + .read_exact_at( + vec.slice(0..(flushed_range.end - flushed_range.start)), + u64::try_from(flushed_range.start).unwrap(), + ctx, + ) + .await? + .into_inner(); + + // copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk + let buffered = self.rw.get_tail_zero_padded(); + vec.extend_from_slice(buffered); + assert_eq!(vec.len(), size); + assert_eq!(vec.len() % PAGE_SZ, 0); + Ok(vec) + } + pub(crate) async fn read_blk( &self, blknum: u32, @@ -116,19 +163,40 @@ impl Drop for RW { } struct PreWarmingWriter { + prewarm_on_write: PrewarmOnWrite, nwritten_blocks: u32, page_cache_file_id: page_cache::FileId, file: VirtualFile, } impl PreWarmingWriter { - fn new(page_cache_file_id: page_cache::FileId, file: VirtualFile) -> Self { + fn new( + page_cache_file_id: page_cache::FileId, + file: VirtualFile, + prewarm_on_write: PrewarmOnWrite, + ) -> Self { Self { + prewarm_on_write, nwritten_blocks: 0, page_cache_file_id, file, } } + + /// Return the byte range within `file` that has been written though `write_all`. + /// + /// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`. + fn written_range(&self) -> (impl Deref> + '_) { + let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap(); + struct Wrapper(Range); + impl Deref for Wrapper { + type Target = Range; + fn deref(&self) -> &Range { + &self.0 + } + } + Wrapper(0..nwritten_blocks * PAGE_SZ) + } } impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter { @@ -178,45 +246,51 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi assert_eq!(&check_bounds_stuff_works, &*buf); } - // Pre-warm page cache with the contents. - // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming - // benefits the code that writes InMemoryLayer=>L0 layers. let nblocks = buflen / PAGE_SZ; let nblocks32 = u32::try_from(nblocks).unwrap(); - let cache = page_cache::get(); - static CTX: Lazy = Lazy::new(|| { - RequestContext::new( - crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache, - crate::context::DownloadBehavior::Error, - ) - }); - for blknum_in_buffer in 0..nblocks { - let blk_in_buffer = &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ]; - let blknum = self - .nwritten_blocks - .checked_add(blknum_in_buffer as u32) - .unwrap(); - match cache - .read_immutable_buf(self.page_cache_file_id, blknum, &CTX) - .await - { - Err(e) => { - error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); - // fail gracefully, it's not the end of the world if we can't pre-warm the cache here - } - Ok(v) => match v { - page_cache::ReadBufResult::Found(_guard) => { - // This function takes &mut self, so, it shouldn't be possible to reach this point. 
- unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \ - and this function takes &mut self, so, no concurrent read_blk is possible"); - } - page_cache::ReadBufResult::NotFound(mut write_guard) => { - write_guard.copy_from_slice(blk_in_buffer); - let _ = write_guard.mark_valid(); + + if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) { + // Pre-warm page cache with the contents. + // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming + // benefits the code that writes InMemoryLayer=>L0 layers. + + let cache = page_cache::get(); + static CTX: Lazy = Lazy::new(|| { + RequestContext::new( + crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache, + crate::context::DownloadBehavior::Error, + ) + }); + for blknum_in_buffer in 0..nblocks { + let blk_in_buffer = + &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ]; + let blknum = self + .nwritten_blocks + .checked_add(blknum_in_buffer as u32) + .unwrap(); + match cache + .read_immutable_buf(self.page_cache_file_id, blknum, &CTX) + .await + { + Err(e) => { + error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); + // fail gracefully, it's not the end of the world if we can't pre-warm the cache here } - }, + Ok(v) => match v { + page_cache::ReadBufResult::Found(_guard) => { + // This function takes &mut self, so, it shouldn't be possible to reach this point. + unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \ + and this function takes &mut self, so, no concurrent read_blk is possible"); + } + page_cache::ReadBufResult::NotFound(mut write_guard) => { + write_guard.copy_from_slice(blk_in_buffer); + let _ = write_guard.mark_valid(); + } + }, + } } } + self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap(); Ok((buflen, buf.into_inner())) } diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs index b37eafb52c5b..fe310acab888 100644 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs @@ -75,6 +75,21 @@ where flushed_offset + u64::try_from(buffer.pending()).unwrap() } + /// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`]. 
+ pub fn get_tail_zero_padded(&self) -> &[u8] { + let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); + let buffer_written_up_to = buffer.pending(); + // pad to next page boundary + let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 { + buffer_written_up_to + } else { + buffer_written_up_to + .checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ)) + .unwrap() + }; + &buffer.as_zero_padded_slice()[0..read_up_to] + } + pub(crate) async fn read_blk(&self, blknum: u32) -> Result, std::io::Error> { let flushed_offset = self.buffered_writer.as_inner().bytes_written(); let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 08c3f19b6f75..b0159e22bfc0 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -43,7 +43,8 @@ use crate::tenant::config::{ use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::storage_layer::inmemory_layer; use crate::tenant::timeline::ShutdownMode; -use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState}; +use crate::tenant::{AttachedTenantConf, GcError, LoadConfigError, SpawnMode, Tenant, TenantState}; +use crate::virtual_file::MaybeFatalIo; use crate::{InitializationOrder, TEMP_FILE_SUFFIX}; use utils::crashsafe::path_with_suffix_extension; @@ -272,7 +273,7 @@ pub struct TenantManager { } fn emergency_generations( - tenant_confs: &HashMap>, + tenant_confs: &HashMap>, ) -> HashMap { tenant_confs .iter() @@ -296,7 +297,7 @@ fn emergency_generations( async fn init_load_generations( conf: &'static PageServerConf, - tenant_confs: &HashMap>, + tenant_confs: &HashMap>, resources: &TenantSharedResources, cancel: &CancellationToken, ) -> anyhow::Result>> { @@ -346,56 +347,32 @@ async fn init_load_generations( /// Given a directory discovered in the pageserver's tenants/ directory, attempt /// to load a tenant config from it. /// -/// If file is missing, return Ok(None) +/// If we cleaned up something expected (like an empty dir or a temp dir), return None. 
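The reworked startup scan treats a tenant directory as one of three things: a leftover temporary dir (delete it), an empty dir from a crashed attach (delete it), or a real shard whose name parses as a tenant shard id (load it); anything else is logged and skipped. A rough standalone sketch of that classification, assuming a `___temp` suffix for temporary directories and a 32-hex-character id prefix; the real code uses `TEMP_FILE_SUFFIX` and parses a `TenantShardId` via `FromStr`.

```rust
use std::fs;
use std::path::{Path, PathBuf};

// Assumed suffix for in-progress directories; the real constant is TEMP_FILE_SUFFIX.
const TEMP_SUFFIX: &str = "___temp";

/// Classify one entry of the tenants/ directory the way the reworked
/// load_tenant_config does: clean up temp and empty dirs, skip garbage names,
/// and only return paths that look like real tenant shards.
fn classify_tenant_dir(path: &Path) -> std::io::Result<Option<PathBuf>> {
    let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
    if name.ends_with(TEMP_SUFFIX) {
        // Leftover from an interrupted attach: delete and move on.
        fs::remove_dir_all(path)?;
        return Ok(None);
    }
    if fs::read_dir(path)?.next().is_none() {
        // Crashed before a config was written: delete the empty dir.
        fs::remove_dir(path)?;
        return Ok(None);
    }
    // A tenant shard id starts with 32 hex characters (shard suffix optional).
    let id_ok = name
        .get(..32)
        .map_or(false, |p| p.chars().all(|c| c.is_ascii_hexdigit()));
    if !id_ok {
        eprintln!("Invalid tenant path (garbage in our repo directory?): {name}");
        return Ok(None);
    }
    Ok(Some(path.to_path_buf()))
}

fn main() -> std::io::Result<()> {
    for entry in fs::read_dir("tenants")? {
        if let Some(dir) = classify_tenant_dir(&entry?.path())? {
            println!("would load config from {}", dir.display());
        }
    }
    Ok(())
}
```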
fn load_tenant_config( conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, dentry: Utf8DirEntry, -) -> anyhow::Result)>> { +) -> Option> { let tenant_dir_path = dentry.path().to_path_buf(); if crate::is_temporary(&tenant_dir_path) { info!("Found temporary tenant directory, removing: {tenant_dir_path}"); // No need to use safe_remove_tenant_dir_all because this is already // a temporary path - if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) { - error!( - "Failed to remove temporary directory '{}': {:?}", - tenant_dir_path, e - ); - } - return Ok(None); + std::fs::remove_dir_all(&tenant_dir_path).fatal_err("delete temporary tenant dir"); + return None; } // This case happens if we crash during attachment before writing a config into the dir let is_empty = tenant_dir_path .is_empty_dir() - .with_context(|| format!("Failed to check whether {tenant_dir_path:?} is an empty dir"))?; + .fatal_err("Checking for empty tenant dir"); if is_empty { info!("removing empty tenant directory {tenant_dir_path:?}"); - if let Err(e) = std::fs::remove_dir(&tenant_dir_path) { - error!( - "Failed to remove empty tenant directory '{}': {e:#}", - tenant_dir_path - ) - } - return Ok(None); + std::fs::remove_dir(&tenant_dir_path).fatal_err("delete empty tenant dir"); + return None; } - let tenant_shard_id = match tenant_dir_path - .file_name() - .unwrap_or_default() - .parse::() - { - Ok(id) => id, - Err(_) => { - warn!("Invalid tenant path (garbage in our repo directory?): {tenant_dir_path}",); - return Ok(None); - } - }; - - Ok(Some(( - tenant_shard_id, - Tenant::load_tenant_config(conf, &tenant_shard_id), - ))) + Some(Tenant::load_tenant_config(conf, &tenant_shard_id)) } /// Initial stage of load: walk the local tenants directory, clean up any temp files, @@ -405,32 +382,51 @@ fn load_tenant_config( /// seconds even on reasonably fast drives. async fn init_load_tenant_configs( conf: &'static PageServerConf, -) -> anyhow::Result>> { +) -> HashMap> { let tenants_dir = conf.tenants_path(); - let dentries = tokio::task::spawn_blocking(move || -> anyhow::Result> { - let dir_entries = tenants_dir - .read_dir_utf8() - .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?; + let dentries = tokio::task::spawn_blocking(move || -> Vec { + let context = format!("read tenants dir {tenants_dir}"); + let dir_entries = tenants_dir.read_dir_utf8().fatal_err(&context); - Ok(dir_entries.collect::, std::io::Error>>()?) + dir_entries + .collect::, std::io::Error>>() + .fatal_err(&context) }) - .await??; + .await + .expect("Config load task panicked"); let mut configs = HashMap::new(); let mut join_set = JoinSet::new(); for dentry in dentries { - join_set.spawn_blocking(move || load_tenant_config(conf, dentry)); + let tenant_shard_id = match dentry.file_name().parse::() { + Ok(id) => id, + Err(_) => { + warn!( + "Invalid tenant path (garbage in our repo directory?): '{}'", + dentry.file_name() + ); + continue; + } + }; + + join_set.spawn_blocking(move || { + ( + tenant_shard_id, + load_tenant_config(conf, tenant_shard_id, dentry), + ) + }); } while let Some(r) = join_set.join_next().await { - if let Some((tenant_id, tenant_config)) = r?? 
{ - configs.insert(tenant_id, tenant_config); + let (tenant_shard_id, tenant_config) = r.expect("Panic in config load task"); + if let Some(tenant_config) = tenant_config { + configs.insert(tenant_shard_id, tenant_config); } } - Ok(configs) + configs } #[derive(Debug, thiserror::Error)] @@ -472,7 +468,7 @@ pub async fn init_tenant_mgr( ); // Scan local filesystem for attached tenants - let tenant_configs = init_load_tenant_configs(conf).await?; + let tenant_configs = init_load_tenant_configs(conf).await; // Determine which tenants are to be secondary or attached, and in which generation let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?; @@ -590,31 +586,23 @@ pub async fn init_tenant_mgr( ); // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running for (tenant_shard_id, location_conf, config_write_result) in config_write_results { - // Errors writing configs are fatal - config_write_result?; + // Writing a config to local disk is foundational to startup up tenants: panic if we can't. + config_write_result.fatal_err("write tenant shard config file"); let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let shard_identity = location_conf.shard; let slot = match location_conf.mode { - LocationMode::Attached(attached_conf) => { - match tenant_spawn( - conf, - tenant_shard_id, - &tenant_dir_path, - resources.clone(), - AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), - shard_identity, - Some(init_order.clone()), - SpawnMode::Lazy, - &ctx, - ) { - Ok(tenant) => TenantSlot::Attached(tenant), - Err(e) => { - error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}"); - continue; - } - } - } + LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn( + conf, + tenant_shard_id, + &tenant_dir_path, + resources.clone(), + AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), + shard_identity, + Some(init_order.clone()), + SpawnMode::Lazy, + &ctx, + )), LocationMode::Secondary(secondary_conf) => { info!( tenant_id = %tenant_shard_id.tenant_id, @@ -649,8 +637,7 @@ pub async fn init_tenant_mgr( }) } -/// Wrapper for Tenant::spawn that checks invariants before running, and inserts -/// a broken tenant in the map if Tenant::spawn fails. +/// Wrapper for Tenant::spawn that checks invariants before running #[allow(clippy::too_many_arguments)] fn tenant_spawn( conf: &'static PageServerConf, @@ -662,23 +649,18 @@ fn tenant_spawn( init_order: Option, mode: SpawnMode, ctx: &RequestContext, -) -> anyhow::Result> { - anyhow::ensure!( - tenant_path.is_dir(), - "Cannot load tenant from path {tenant_path:?}, it either does not exist or not a directory" - ); - anyhow::ensure!( - !crate::is_temporary(tenant_path), - "Cannot load tenant from temporary path {tenant_path:?}" - ); - anyhow::ensure!( - !tenant_path.is_empty_dir().with_context(|| { - format!("Failed to check whether {tenant_path:?} is an empty dir") - })?, - "Cannot load tenant from empty directory {tenant_path:?}" - ); - - let tenant = Tenant::spawn( +) -> Arc { + // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed + // path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode + // to avoid impacting prod runtime performance. 
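Several of these startup paths now call `.fatal_err(context)` instead of threading `anyhow` errors upward: if the pageserver cannot read or write its own tenant configs, it should crash loudly rather than start with a partial tenant map. A minimal sketch of what such an extension trait can look like; the real `MaybeFatalIo` in `virtual_file` may differ in how it logs and exits.

```rust
/// Sketch of a `fatal_err`-style extension trait: unwrap the Ok value or
/// log the error with a short context string and terminate the process.
trait FatalErr<T> {
    fn fatal_err(self, context: &str) -> T;
}

impl<T, E: std::fmt::Display> FatalErr<T> for Result<T, E> {
    fn fatal_err(self, context: &str) -> T {
        match self {
            Ok(v) => v,
            Err(e) => {
                // Startup cannot proceed without this resource; crash loudly
                // rather than limping along with a broken tenant map.
                eprintln!("fatal error: {context}: {e}");
                std::process::abort();
            }
        }
    }
}

fn main() {
    // Reading a config that must exist at startup: any failure is fatal,
    // so this aborts if the file is missing or unreadable.
    let contents: String =
        std::fs::read_to_string("tenants/config-v1").fatal_err("read tenant shard config file");
    println!("{} bytes of config", contents.len());
}
```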
+ assert!(!crate::is_temporary(tenant_path)); + debug_assert!(tenant_path.is_dir()); + debug_assert!(conf + .tenant_location_config_path(&tenant_shard_id) + .try_exists() + .unwrap()); + + Tenant::spawn( conf, tenant_shard_id, resources, @@ -687,9 +669,7 @@ fn tenant_spawn( init_order, mode, ctx, - ); - - Ok(tenant) + ) } async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { @@ -840,8 +820,9 @@ pub(crate) enum UpsertLocationError { #[error("Failed to flush: {0}")] Flush(anyhow::Error), + /// This error variant is for unexpected situations (soft assertions) where the system is in an unexpected state. #[error("Internal error: {0}")] - Other(#[from] anyhow::Error), + InternalError(anyhow::Error), } impl TenantManager { @@ -971,7 +952,8 @@ impl TenantManager { match fast_path_taken { Some(FastPathModified::Attached(tenant)) => { Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await?; + .await + .fatal_err("write tenant shard config"); // Transition to AttachedStale means we may well hold a valid generation // still, and have been requested to go stale as part of a migration. If @@ -1001,7 +983,8 @@ impl TenantManager { } Some(FastPathModified::Secondary(_secondary_tenant)) => { Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await?; + .await + .fatal_err("write tenant shard config"); return Ok(None); } @@ -1067,7 +1050,7 @@ impl TenantManager { Some(TenantSlot::InProgress(_)) => { // This should never happen: acquire_slot should error out // if the contents of a slot were InProgress. - return Err(UpsertLocationError::Other(anyhow::anyhow!( + return Err(UpsertLocationError::InternalError(anyhow::anyhow!( "Acquired an InProgress slot, this is a bug." ))); } @@ -1086,12 +1069,14 @@ impl TenantManager { // Does not need to be fsync'd because local storage is just a cache. tokio::fs::create_dir_all(&timelines_path) .await - .with_context(|| format!("Creating {timelines_path}"))?; + .fatal_err("create timelines/ dir"); // Before activating either secondary or attached mode, persist the // configuration, so that on restart we will re-attach (or re-start // secondary) on the tenant. - Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config).await?; + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await + .fatal_err("write tenant shard config"); let new_slot = match &new_location_config.mode { LocationMode::Secondary(secondary_config) => { @@ -1110,13 +1095,15 @@ impl TenantManager { // from upserts. This enables creating generation-less tenants even though neon_local // always uses generations when calling the location conf API. let attached_conf = if cfg!(feature = "testing") { - let mut conf = AttachedTenantConf::try_from(new_location_config)?; + let mut conf = AttachedTenantConf::try_from(new_location_config) + .map_err(UpsertLocationError::BadRequest)?; if self.conf.control_plane_api.is_none() { conf.location.generation = Generation::none(); } conf } else { - AttachedTenantConf::try_from(new_location_config)? + AttachedTenantConf::try_from(new_location_config) + .map_err(UpsertLocationError::BadRequest)? 
}; let tenant = tenant_spawn( @@ -1129,7 +1116,7 @@ impl TenantManager { None, spawn_mode, ctx, - )?; + ); TenantSlot::Attached(tenant) } @@ -1143,7 +1130,7 @@ impl TenantManager { match slot_guard.upsert(new_slot) { Err(TenantSlotUpsertError::InternalError(e)) => { - Err(UpsertLocationError::Other(anyhow::anyhow!(e))) + Err(UpsertLocationError::InternalError(anyhow::anyhow!(e))) } Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)), Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => { @@ -1250,7 +1237,7 @@ impl TenantManager { None, SpawnMode::Eager, ctx, - )?; + ); slot_guard.upsert(TenantSlot::Attached(tenant))?; @@ -1984,7 +1971,7 @@ impl TenantManager { None, SpawnMode::Eager, ctx, - )?; + ); slot_guard.upsert(TenantSlot::Attached(tenant))?; diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index e33e4b84aa97..bc9364de61d4 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -519,7 +519,7 @@ impl RemoteTimelineClient { local_path: &Utf8Path, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let downloaded_size = { let _unfinished_gauge_guard = self.metrics.call_begin( &RemoteOpFileKind::Layer, diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index af6840f525ae..a233d11c4a11 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -23,6 +23,8 @@ use super::{ storage_layer::LayerName, }; +use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE; +use metrics::UIntGauge; use pageserver_api::{ models, shard::{ShardIdentity, TenantShardId}, @@ -99,6 +101,17 @@ pub(crate) struct SecondaryTenant { // Public state indicating overall progress of downloads relative to the last heatmap seen pub(crate) progress: std::sync::Mutex, + + // Sum of layer sizes on local disk + pub(super) resident_size_metric: UIntGauge, +} + +impl Drop for SecondaryTenant { + fn drop(&mut self) { + let tenant_id = self.tenant_shard_id.tenant_id.to_string(); + let shard_id = format!("{}", self.tenant_shard_id.shard_slug()); + let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); + } } impl SecondaryTenant { @@ -108,6 +121,12 @@ impl SecondaryTenant { tenant_conf: TenantConfOpt, config: &SecondaryLocationConfig, ) -> Arc { + let tenant_id = tenant_shard_id.tenant_id.to_string(); + let shard_id = format!("{}", tenant_shard_id.shard_slug()); + let resident_size_metric = SECONDARY_RESIDENT_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id]) + .unwrap(); + Arc::new(Self { tenant_shard_id, // todo: shall we make this a descendent of the @@ -123,6 +142,8 @@ impl SecondaryTenant { detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())), progress: std::sync::Mutex::default(), + + resident_size_metric, }) } @@ -211,16 +232,12 @@ impl SecondaryTenant { // have to 100% match what is on disk, because it's a best-effort warming // of the cache. let mut detail = this.detail.lock().unwrap(); - if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) { - let removed = timeline_detail.on_disk_layers.remove(&name); - - // We might race with removal of the same layer during downloads, if it was removed - // from the heatmap. If we see that the OnDiskState is gone, then no need to - // do a physical deletion or store in evicted_at. 
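The new per-shard resident-size gauge follows a standard labelled-metric lifecycle: resolve the child gauge once when the `SecondaryTenant` is built, add and subtract layer sizes as they are downloaded or evicted, and remove the label series on drop so detached shards do not linger in `/metrics`. A standalone sketch of that lifecycle using the `prometheus` crate directly (the patch goes through the workspace `metrics` wrapper and a `UIntGauge`); the `ShardMetrics` struct is illustrative.

```rust
use prometheus::{IntGauge, IntGaugeVec, Opts};

/// Illustrative stand-in for SecondaryTenant: owns the labelled gauge child
/// and removes its label series again on Drop.
struct ShardMetrics {
    tenant_id: String,
    shard_id: String,
    resident_size: IntGauge,
    vec: IntGaugeVec,
}

impl ShardMetrics {
    fn new(vec: &IntGaugeVec, tenant_id: &str, shard_id: &str) -> Self {
        ShardMetrics {
            tenant_id: tenant_id.to_string(),
            shard_id: shard_id.to_string(),
            resident_size: vec.with_label_values(&[tenant_id, shard_id]),
            vec: vec.clone(),
        }
    }
    fn layer_downloaded(&self, bytes: i64) {
        self.resident_size.add(bytes);
    }
    fn layer_evicted(&self, bytes: i64) {
        self.resident_size.sub(bytes);
    }
}

impl Drop for ShardMetrics {
    fn drop(&mut self) {
        // Keep the exporter free of series for shards this pageserver no longer hosts.
        let _ = self
            .vec
            .remove_label_values(&[self.tenant_id.as_str(), self.shard_id.as_str()]);
    }
}

fn main() {
    let vec = IntGaugeVec::new(
        Opts::new("secondary_resident_physical_size", "bytes on local disk"),
        &["tenant_id", "shard_id"],
    )
    .unwrap();
    let m = ShardMetrics::new(&vec, "tenant-a", "0001");
    m.layer_downloaded(8 << 20);
    m.layer_evicted(1 << 20);
    assert_eq!(m.resident_size.get(), 7 << 20);
    drop(m); // the tenant-a/0001 series is removed from the vec here
}
```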
- if let Some(removed) = removed { - removed.remove_blocking(); - timeline_detail.evicted_at.insert(name, now); - } + if let Some(removed) = + detail.evict_layer(name, &timeline_id, now, &this.resident_size_metric) + { + // We might race with removal of the same layer during downloads, so finding the layer we + // were trying to remove is optional. Only issue the disk I/O to remove it if we found it. + removed.remove_blocking(); } }) .await diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 24176ecf1956..27439d4f030d 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -46,6 +46,7 @@ use crate::tenant::{ use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; use futures::Future; +use metrics::UIntGauge; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; use remote_storage::{DownloadError, Etag, GenericRemoteStorage}; @@ -131,16 +132,66 @@ impl OnDiskState { .or_else(fs_ext::ignore_not_found) .fatal_err("Deleting secondary layer") } + + pub(crate) fn file_size(&self) -> u64 { + self.metadata.file_size + } } #[derive(Debug, Clone, Default)] pub(super) struct SecondaryDetailTimeline { - pub(super) on_disk_layers: HashMap, + on_disk_layers: HashMap, /// We remember when layers were evicted, to prevent re-downloading them. pub(super) evicted_at: HashMap, } +impl SecondaryDetailTimeline { + pub(super) fn remove_layer( + &mut self, + name: &LayerName, + resident_metric: &UIntGauge, + ) -> Option { + let removed = self.on_disk_layers.remove(name); + if let Some(removed) = &removed { + resident_metric.sub(removed.file_size()); + } + removed + } + + /// `local_path` + fn touch_layer( + &mut self, + conf: &'static PageServerConf, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + touched: &HeatMapLayer, + resident_metric: &UIntGauge, + local_path: F, + ) where + F: FnOnce() -> Utf8PathBuf, + { + use std::collections::hash_map::Entry; + match self.on_disk_layers.entry(touched.name.clone()) { + Entry::Occupied(mut v) => { + v.get_mut().access_time = touched.access_time; + } + Entry::Vacant(e) => { + e.insert(OnDiskState::new( + conf, + tenant_shard_id, + timeline_id, + touched.name.clone(), + touched.metadata.clone(), + touched.access_time, + local_path(), + )); + resident_metric.add(touched.metadata.file_size); + } + } + } +} + // Aspects of a heatmap that we remember after downloading it #[derive(Clone, Debug)] struct DownloadSummary { @@ -158,7 +209,7 @@ pub(super) struct SecondaryDetail { last_download: Option, next_download: Option, - pub(super) timelines: HashMap, + timelines: HashMap, } /// Helper for logging SystemTime @@ -191,6 +242,38 @@ impl SecondaryDetail { } } + pub(super) fn evict_layer( + &mut self, + name: LayerName, + timeline_id: &TimelineId, + now: SystemTime, + resident_metric: &UIntGauge, + ) -> Option { + let timeline = self.timelines.get_mut(timeline_id)?; + let removed = timeline.remove_layer(&name, resident_metric); + if removed.is_some() { + timeline.evicted_at.insert(name, now); + } + removed + } + + pub(super) fn remove_timeline( + &mut self, + timeline_id: &TimelineId, + resident_metric: &UIntGauge, + ) { + let removed = self.timelines.remove(timeline_id); + if let Some(removed) = removed { + resident_metric.sub( + removed + .on_disk_layers + .values() + .map(|l| l.metadata.file_size) + .sum(), + ); + } + } + /// Additionally returns the total number of layers, used for more 
stable relative access time /// based eviction. pub(super) fn get_layers_for_eviction( @@ -262,6 +345,7 @@ impl scheduler::RunningJob for RunningDownload { struct CompleteDownload { secondary_state: Arc, completed_at: Instant, + result: Result<(), UpdateError>, } impl scheduler::Completion for CompleteDownload { @@ -286,21 +370,33 @@ impl JobGenerator { + // Start downloading again as soon as we can. This will involve waiting for the scheduler's + // scheduling interval. This slightly reduces the peak download speed of tenants that hit their + // deadline and keep restarting, but that also helps give other tenants a chance to execute rather + // that letting one big tenant dominate for a long time. + detail.next_download = Some(Instant::now()); + } + _ => { + let period = detail + .last_download + .as_ref() + .map(|d| d.upload_period) + .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL); + + // We advance next_download irrespective of errors: we don't want error cases to result in + // expensive busy-polling. + detail.next_download = Some(Instant::now() + period_jitter(period, 5)); + } + } } async fn schedule(&mut self) -> SchedulingResult { @@ -396,9 +492,10 @@ impl JobGenerator { tracing::info!("No heatmap found for tenant. This is fine if it is new."); @@ -415,6 +512,9 @@ impl JobGenerator { tracing::error!("Error while downloading tenant: {e}"); }, + Err(UpdateError::Restart) => { + tracing::info!("Download reached deadline & will restart to update heatmap") + } Ok(()) => {} }; @@ -436,6 +536,7 @@ impl JobGenerator { /// Errors that may be encountered while updating a tenant #[derive(thiserror::Error, Debug)] enum UpdateError { + /// This is not a true failure, but it's how a download indicates that it would like to be restarted by + /// the scheduler, to pick up the latest heatmap + #[error("Reached deadline, restarting downloads")] + Restart, + #[error("No remote data found")] NoData, #[error("Insufficient local storage space")] @@ -578,8 +684,13 @@ impl<'a> TenantDownloader<'a> { Some(t) => t, None => { // We have no existing state: need to scan local disk for layers first. - let timeline_state = - init_timeline_state(self.conf, tenant_shard_id, timeline).await; + let timeline_state = init_timeline_state( + self.conf, + tenant_shard_id, + timeline, + &self.secondary_state.resident_size_metric, + ) + .await; // Re-acquire detail lock now that we're done with async load from local FS self.secondary_state @@ -603,6 +714,26 @@ impl<'a> TenantDownloader<'a> { self.prepare_timelines(&heatmap, heatmap_mtime).await?; } + // Calculate a deadline for downloads: if downloading takes longer than this, it is useful to drop out and start again, + // so that we are always using reasonably a fresh heatmap. Otherwise, if we had really huge content to download, we might + // spend 10s of minutes downloading layers we don't need. + // (see https://github.com/neondatabase/neon/issues/8182) + let deadline = { + let period = self + .secondary_state + .detail + .lock() + .unwrap() + .last_download + .as_ref() + .map(|d| d.upload_period) + .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL); + + // Use double the period: we are not promising to complete within the period, this is just a heuristic + // to keep using a "reasonably fresh" heatmap. 
+ Instant::now() + period * 2 + }; + // Download the layers in the heatmap for timeline in heatmap.timelines { let timeline_state = timeline_states @@ -618,7 +749,7 @@ impl<'a> TenantDownloader<'a> { } let timeline_id = timeline.timeline_id; - self.download_timeline(timeline, timeline_state, ctx) + self.download_timeline(timeline, timeline_state, deadline, ctx) .instrument(tracing::info_span!( "secondary_download_timeline", tenant_id=%tenant_shard_id.tenant_id, @@ -628,6 +759,25 @@ impl<'a> TenantDownloader<'a> { .await?; } + // Metrics consistency check in testing builds + if cfg!(feature = "testing") { + let detail = self.secondary_state.detail.lock().unwrap(); + let resident_size = detail + .timelines + .values() + .map(|tl| { + tl.on_disk_layers + .values() + .map(|v| v.metadata.file_size) + .sum::() + }) + .sum::(); + assert_eq!( + resident_size, + self.secondary_state.resident_size_metric.get() + ); + } + // Only update last_etag after a full successful download: this way will not skip // the next download, even if the heatmap's actual etag is unchanged. self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary { @@ -740,7 +890,7 @@ impl<'a> TenantDownloader<'a> { for delete_timeline in &delete_timelines { // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal // from disk fails that will be a fatal error. - detail.timelines.remove(delete_timeline); + detail.remove_timeline(delete_timeline, &self.secondary_state.resident_size_metric); } } @@ -758,7 +908,7 @@ impl<'a> TenantDownloader<'a> { let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else { continue; }; - timeline_state.on_disk_layers.remove(&layer_name); + timeline_state.remove_layer(&layer_name, &self.secondary_state.resident_size_metric); } for timeline_id in delete_timelines { @@ -827,26 +977,28 @@ impl<'a> TenantDownloader<'a> { .and_then(|x| x) } - async fn download_timeline( + /// Download heatmap layers that are not present on local disk, or update their + /// access time if they are already present. + async fn download_timeline_layers( &self, + tenant_shard_id: &TenantShardId, timeline: HeatMapTimeline, timeline_state: SecondaryDetailTimeline, + deadline: Instant, ctx: &RequestContext, - ) -> Result<(), UpdateError> { - debug_assert_current_span_has_tenant_and_timeline_id(); - let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); - + ) -> (Result<(), UpdateError>, Vec) { // Accumulate updates to the state let mut touched = Vec::new(); - tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); - - // Download heatmap layers that are not present on local disk, or update their - // access time if they are already present. for layer in timeline.layers { if self.secondary_state.cancel.is_cancelled() { tracing::debug!("Cancelled -- dropping out of layer loop"); - return Err(UpdateError::Cancelled); + return (Err(UpdateError::Cancelled), touched); + } + + if Instant::now() > deadline { + // We've been running downloads for a while, restart to download latest heatmap. + return (Err(UpdateError::Restart), touched); } // Existing on-disk layers: just update their access time. @@ -916,52 +1068,66 @@ impl<'a> TenantDownloader<'a> { match self .download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx) - .await? 
+ .await { - Some(layer) => touched.push(layer), - None => { + Ok(Some(layer)) => touched.push(layer), + Ok(None) => { // Not an error but we didn't download it: remote layer is missing. Don't add it to the list of // things to consider touched. } + Err(e) => { + return (Err(e), touched); + } } } - // Write updates to state to record layers we just downloaded or touched. + (Ok(()), touched) + } + + async fn download_timeline( + &self, + timeline: HeatMapTimeline, + timeline_state: SecondaryDetailTimeline, + deadline: Instant, + ctx: &RequestContext, + ) -> Result<(), UpdateError> { + debug_assert_current_span_has_tenant_and_timeline_id(); + let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); + let timeline_id = timeline.timeline_id; + + tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); + + let (result, touched) = self + .download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx) + .await; + + // Write updates to state to record layers we just downloaded or touched, irrespective of whether the overall result was successful { let mut detail = self.secondary_state.detail.lock().unwrap(); - let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default(); + let timeline_detail = detail.timelines.entry(timeline_id).or_default(); tracing::info!("Wrote timeline_detail for {} touched layers", touched.len()); - - for t in touched { - use std::collections::hash_map::Entry; - match timeline_detail.on_disk_layers.entry(t.name.clone()) { - Entry::Occupied(mut v) => { - v.get_mut().access_time = t.access_time; - } - Entry::Vacant(e) => { - let local_path = local_layer_path( + touched.into_iter().for_each(|t| { + timeline_detail.touch_layer( + self.conf, + tenant_shard_id, + &timeline_id, + &t, + &self.secondary_state.resident_size_metric, + || { + local_layer_path( self.conf, tenant_shard_id, - &timeline.timeline_id, + &timeline_id, &t.name, &t.metadata.generation, - ); - e.insert(OnDiskState::new( - self.conf, - tenant_shard_id, - &timeline.timeline_id, - t.name, - t.metadata.clone(), - t.access_time, - local_path, - )); - } - } - } + ) + }, + ) + }); } - Ok(()) + result } /// Call this during timeline download if a layer will _not_ be downloaded, to update progress statistics @@ -1067,6 +1233,7 @@ async fn init_timeline_state( conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, heatmap: &HeatMapTimeline, + resident_metric: &UIntGauge, ) -> SecondaryDetailTimeline { let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id); let mut detail = SecondaryDetailTimeline::default(); @@ -1142,17 +1309,13 @@ async fn init_timeline_state( } else { // We expect the access time to be initialized immediately afterwards, when // the latest heatmap is applied to the state. 
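The refactored secondary download pass separates partial progress from the outcome: the layer loop hands back everything it touched together with a `Restart`, cancel, or error result, the caller records that progress unconditionally, and a `Restart` (deadline hit) reschedules the tenant immediately so it picks up a fresh heatmap. A compact standalone sketch of that shape with illustrative types; in the patch the deadline is twice the heatmap upload period.

```rust
use std::time::{Duration, Instant};

// Illustrative outcome type, shaped like UpdateError::Restart in the patch.
#[derive(Debug)]
enum Outcome {
    Done,
    Restart, // deadline hit: persist progress, fetch a fresh heatmap, go again
}

/// Download a batch of layers until done or until the deadline passes.
/// Always returns the layers we touched so the caller can record them,
/// even when the pass is cut short.
fn download_layers(todo: &[&str], deadline: Instant) -> (Outcome, Vec<String>) {
    let mut touched = Vec::new();
    for layer in todo {
        if Instant::now() > deadline {
            return (Outcome::Restart, touched);
        }
        // ... fetch the layer here ...
        touched.push(layer.to_string());
    }
    (Outcome::Done, touched)
}

fn main() {
    // Deadline heuristic from the patch: twice the heatmap upload period.
    let upload_period = Duration::from_millis(10); // assumed value for the demo
    let deadline = Instant::now() + upload_period * 2;

    let heatmap = ["layer-0", "layer-1", "layer-2"];
    let (outcome, touched) = download_layers(&heatmap, deadline);

    // Record progress irrespective of the outcome, then decide when to run again.
    println!("recorded {} layers, outcome {:?}", touched.len(), outcome);
    let next_run = match outcome {
        Outcome::Restart => Instant::now(), // restart as soon as the scheduler allows
        Outcome::Done => Instant::now() + upload_period,
    };
    let _ = next_run;
}
```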
- detail.on_disk_layers.insert( - name.clone(), - OnDiskState::new( - conf, - tenant_shard_id, - &heatmap.timeline_id, - name, - remote_meta.metadata.clone(), - remote_meta.access_time, - file_path, - ), + detail.touch_layer( + conf, + tenant_shard_id, + &heatmap.timeline_id, + remote_meta, + resident_metric, + || file_path, ); } } diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index b2338b620ebf..23354417e788 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -3,6 +3,7 @@ use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::sync::Arc; +use tenant_size_model::svg::SvgBranchKind; use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; @@ -87,6 +88,9 @@ impl SegmentMeta { LsnKind::BranchPoint => true, LsnKind::GcCutOff => true, LsnKind::BranchEnd => false, + LsnKind::LeasePoint => true, + LsnKind::LeaseStart => false, + LsnKind::LeaseEnd => false, } } } @@ -103,6 +107,21 @@ pub enum LsnKind { GcCutOff, /// Last record LSN BranchEnd, + /// A LSN lease is granted here. + LeasePoint, + /// A lease starts from here. + LeaseStart, + /// Last record LSN for the lease (should have the same LSN as the previous [`LsnKind::LeaseStart`]). + LeaseEnd, +} + +impl From for SvgBranchKind { + fn from(kind: LsnKind) -> Self { + match kind { + LsnKind::LeasePoint | LsnKind::LeaseStart | LsnKind::LeaseEnd => SvgBranchKind::Lease, + _ => SvgBranchKind::Timeline, + } + } } /// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as @@ -124,6 +143,9 @@ pub struct TimelineInputs { /// Cutoff point calculated from the user-supplied 'max_retention_period' retention_param_cutoff: Option, + + /// Lease points on the timeline + lease_points: Vec, } /// Gathers the inputs for the tenant sizing model. @@ -234,6 +256,13 @@ pub(super) async fn gather_inputs( None }; + let lease_points = gc_info + .leases + .keys() + .filter(|&&lsn| lsn > ancestor_lsn) + .copied() + .collect::>(); + // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we // want to query any logical size before initdb_lsn. let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn); @@ -248,6 +277,8 @@ pub(super) async fn gather_inputs( .map(|lsn| (lsn, LsnKind::BranchPoint)) .collect::>(); + lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint))); + drop(gc_info); // Add branch points we collected earlier, just in case there were any that were @@ -296,6 +327,7 @@ pub(super) async fn gather_inputs( if kind == LsnKind::BranchPoint { branchpoint_segments.insert((timeline_id, lsn), segments.len()); } + segments.push(SegmentMeta { segment: Segment { parent: Some(parent), @@ -306,7 +338,45 @@ pub(super) async fn gather_inputs( timeline_id: timeline.timeline_id, kind, }); - parent += 1; + + parent = segments.len() - 1; + + if kind == LsnKind::LeasePoint { + // Needs `LeaseStart` and `LeaseEnd` as well to model lease as a read-only branch that never writes data + // (i.e. it's lsn has not advanced from ancestor_lsn), and therefore the three segments have the same LSN + // value. Without the other two segments, the calculation code would not count the leased LSN as a point + // to be retained. + // Did not use `BranchStart` or `BranchEnd` so we can differentiate branches and leases during debug. + // + // Alt Design: rewrite the entire calculation code to be independent of timeline id. 
Both leases and + // branch points can be given a synthetic id so we can unite them. + let mut lease_parent = parent; + + // Start of a lease. + segments.push(SegmentMeta { + segment: Segment { + parent: Some(lease_parent), + lsn: lsn.0, + size: None, // Filled in later, if necessary + needed: lsn > next_gc_cutoff, // only needed if the point is within rentention. + }, + timeline_id: timeline.timeline_id, + kind: LsnKind::LeaseStart, + }); + lease_parent += 1; + + // End of the lease. + segments.push(SegmentMeta { + segment: Segment { + parent: Some(lease_parent), + lsn: lsn.0, + size: None, // Filled in later, if necessary + needed: true, // everything at the lease LSN must be readable => is needed + }, + timeline_id: timeline.timeline_id, + kind: LsnKind::LeaseEnd, + }); + } } // Current end of the timeline @@ -332,6 +402,7 @@ pub(super) async fn gather_inputs( pitr_cutoff, next_gc_cutoff, retention_param_cutoff, + lease_points, }); } @@ -674,7 +745,8 @@ fn verify_size_for_multiple_branches() { "horizon_cutoff": "0/2210CD0", "pitr_cutoff": "0/2210CD0", "next_gc_cutoff": "0/2210CD0", - "retention_param_cutoff": null + "retention_param_cutoff": null, + "lease_points": [] }, { "timeline_id": "454626700469f0a9914949b9d018e876", @@ -684,7 +756,8 @@ fn verify_size_for_multiple_branches() { "horizon_cutoff": "0/1817770", "pitr_cutoff": "0/1817770", "next_gc_cutoff": "0/1817770", - "retention_param_cutoff": null + "retention_param_cutoff": null, + "lease_points": [] }, { "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", @@ -694,7 +767,8 @@ fn verify_size_for_multiple_branches() { "horizon_cutoff": "0/18B3D98", "pitr_cutoff": "0/18B3D98", "next_gc_cutoff": "0/18B3D98", - "retention_param_cutoff": null + "retention_param_cutoff": null, + "lease_points": [] } ] } @@ -749,7 +823,8 @@ fn verify_size_for_one_branch() { "horizon_cutoff": "47/240A5860", "pitr_cutoff": "47/240A5860", "next_gc_cutoff": "47/240A5860", - "retention_param_cutoff": "0/0" + "retention_param_cutoff": "0/0", + "lease_points": [] } ] }"#; diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9607546ce0f2..62730f88b260 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -7,6 +7,9 @@ pub(crate) mod layer; mod layer_desc; mod layer_name; +#[cfg(test)] +pub mod merge_iterator; + use crate::context::{AccessStatsBehavior, RequestContext}; use crate::repository::Value; use crate::task_mgr::TaskKind; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index bf5d9249ebb5..dfd0196c87e9 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -49,7 +49,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::LayerAccessKind; +use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind}; use pageserver_api::shard::TenantShardId; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; @@ -223,6 +223,11 @@ pub struct DeltaLayerInner { file: VirtualFile, file_id: FileId, + #[allow(dead_code)] + layer_key_range: Range, + #[allow(dead_code)] + layer_lsn_range: Range, + max_vectored_read_bytes: Option, } @@ -452,7 +457,12 @@ impl DeltaLayerWriterInner { ctx: &RequestContext, ) -> (Vec, anyhow::Result<()>) { assert!(self.lsn_range.start <= lsn); - let (val, res) = 
self.blob_writer.write_blob(val, ctx).await; + // We don't want to use compression in delta layer creation + let compression = ImageCompressionAlgorithm::Disabled; + let (val, res) = self + .blob_writer + .write_blob_maybe_compressed(val, ctx, compression) + .await; let off = match res { Ok(off) => off, Err(e) => return (val, Err(anyhow::anyhow!(e))), @@ -737,6 +747,16 @@ impl DeltaLayer { } impl DeltaLayerInner { + #[cfg(test)] + pub(crate) fn key_range(&self) -> &Range { + &self.layer_key_range + } + + #[cfg(test)] + pub(crate) fn lsn_range(&self) -> &Range { + &self.layer_lsn_range + } + /// Returns nested result following Result, Critical>: /// - inner has the success or transient failure /// - outer has the permanent failure @@ -785,6 +805,8 @@ impl DeltaLayerInner { index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, max_vectored_read_bytes, + layer_key_range: actual_summary.key_range, + layer_lsn_range: actual_summary.lsn_range, })) } @@ -1492,6 +1514,24 @@ impl DeltaLayerInner { ); offset } + + #[cfg(test)] + pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> { + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = + DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); + DeltaLayerIterator { + delta_layer: self, + ctx, + index_iter: tree_reader.iter(&[0; DELTA_KEY_SIZE], ctx), + key_values_batch: std::collections::VecDeque::new(), + is_end: false, + planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new( + 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. + 1024, // The default value. Unit tests might use a different value + ), + } + } } /// A set of data associated with a delta layer key and its value @@ -1552,7 +1592,71 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del } #[cfg(test)] -mod test { +pub struct DeltaLayerIterator<'a> { + delta_layer: &'a DeltaLayerInner, + ctx: &'a RequestContext, + planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner, + index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>, + key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>, + is_end: bool, +} + +#[cfg(test)] +impl<'a> DeltaLayerIterator<'a> { + /// Retrieve a batch of key-value pairs into the iterator buffer. 
+ async fn next_batch(&mut self) -> anyhow::Result<()> { + assert!(self.key_values_batch.is_empty()); + assert!(!self.is_end); + + let plan = loop { + if let Some(res) = self.index_iter.next().await { + let (raw_key, value) = res?; + let key = Key::from_slice(&raw_key[..KEY_SIZE]); + let lsn = DeltaKey::extract_lsn_from_buf(&raw_key); + let blob_ref = BlobRef(value); + let offset = blob_ref.pos(); + if let Some(batch_plan) = self.planner.handle(key, lsn, offset, BlobFlag::None) { + break batch_plan; + } + } else { + self.is_end = true; + let data_end_offset = self.delta_layer.index_start_offset(); + break self.planner.handle_range_end(data_end_offset); + } + }; + let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file); + let mut next_batch = std::collections::VecDeque::new(); + let buf_size = plan.size(); + let buf = BytesMut::with_capacity(buf_size); + let blobs_buf = vectored_blob_reader + .read_blobs(&plan, buf, self.ctx) + .await?; + let frozen_buf = blobs_buf.buf.freeze(); + for meta in blobs_buf.blobs.iter() { + let value = Value::des(&frozen_buf[meta.start..meta.end])?; + next_batch.push_back((meta.meta.key, meta.meta.lsn, value)); + } + self.key_values_batch = next_batch; + Ok(()) + } + + pub async fn next(&mut self) -> anyhow::Result> { + if self.key_values_batch.is_empty() { + if self.is_end { + return Ok(None); + } + self.next_batch().await?; + } + Ok(Some( + self.key_values_batch + .pop_front() + .expect("should not be empty"), + )) + } +} + +#[cfg(test)] +pub(crate) mod test { use std::collections::BTreeMap; use itertools::MinMaxResult; @@ -1560,6 +1664,9 @@ mod test { use rand::RngCore; use super::*; + use crate::tenant::harness::TIMELINE_ID; + use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; + use crate::tenant::Tenant; use crate::{ context::DownloadBehavior, task_mgr::TaskKind, @@ -2126,4 +2233,123 @@ mod test { assert_eq!(utils::Hex(&scratch_left), utils::Hex(&scratch_right)); } } + + pub(crate) fn sort_delta( + (k1, l1, _): &(Key, Lsn, Value), + (k2, l2, _): &(Key, Lsn, Value), + ) -> std::cmp::Ordering { + (k1, l1).cmp(&(k2, l2)) + } + + pub(crate) async fn produce_delta_layer( + tenant: &Tenant, + tline: &Arc, + mut deltas: Vec<(Key, Lsn, Value)>, + ctx: &RequestContext, + ) -> anyhow::Result { + deltas.sort_by(sort_delta); + let (key_start, _, _) = deltas.first().unwrap(); + let (key_max, _, _) = deltas.first().unwrap(); + let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); + let lsn_max = deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap(); + let lsn_end = Lsn(lsn_max.0 + 1); + let mut writer = DeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + *key_start, + (*lsn_min)..lsn_end, + ctx, + ) + .await?; + let key_end = key_max.next(); + + for (key, lsn, value) in deltas { + writer.put_value(key, lsn, value, ctx).await?; + } + let delta_layer = writer.finish(key_end, tline, ctx).await?; + + Ok::<_, anyhow::Error>(delta_layer) + } + + async fn assert_delta_iter_equal( + delta_iter: &mut DeltaLayerIterator<'_>, + expect: &[(Key, Lsn, Value)], + ) { + let mut expect_iter = expect.iter(); + loop { + let o1 = delta_iter.next().await.unwrap(); + let o2 = expect_iter.next(); + assert_eq!(o1.is_some(), o2.is_some()); + if o1.is_none() && o2.is_none() { + break; + } + let (k1, l1, v1) = o1.unwrap(); + let (k2, l2, v2) = o2.unwrap(); + assert_eq!(&k1, k2); + assert_eq!(l1, *l2); + assert_eq!(&v1, v2); + } + } + + #[tokio::test] + async fn delta_layer_iterator() { + use crate::repository::Value; 
+ use bytes::Bytes; + + let harness = TenantHarness::create("delta_layer_iterator").unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + const N: usize = 1000; + let test_deltas = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10), + Lsn(0x10 * ((idx as u64) % 10 + 1)), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer = produce_delta_layer(&tenant, &tline, test_deltas.clone(), &ctx) + .await + .unwrap(); + let delta_layer = resident_layer.get_as_delta(&ctx).await.unwrap(); + for max_read_size in [1, 1024] { + for batch_size in [1, 2, 4, 8, 3, 7, 13] { + println!("running with batch_size={batch_size} max_read_size={max_read_size}"); + // Test if the batch size is correctly determined + let mut iter = delta_layer.iter(&ctx); + iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut num_items = 0; + for _ in 0..3 { + iter.next_batch().await.unwrap(); + num_items += iter.key_values_batch.len(); + if max_read_size == 1 { + // every key should be a batch b/c the value is larger than max_read_size + assert_eq!(iter.key_values_batch.len(), 1); + } else { + assert_eq!(iter.key_values_batch.len(), batch_size); + } + if num_items >= N { + break; + } + iter.key_values_batch.clear(); + } + // Test if the result is correct + let mut iter = delta_layer.iter(&ctx); + iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + assert_delta_iter_equal(&mut iter, &test_deltas).await; + } + } + } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 50aacbd9ad46..1e03e1a58c92 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -369,6 +369,16 @@ impl ImageLayer { } impl ImageLayerInner { + #[cfg(test)] + pub(crate) fn key_range(&self) -> &Range { + &self.key_range + } + + #[cfg(test)] + pub(crate) fn lsn(&self) -> Lsn { + self.lsn + } + /// Returns nested result following Result, Critical>: /// - inner has the success or transient failure /// - outer has the permanent failure diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 6624fb7e6ba5..5941a52e9825 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -6,13 +6,14 @@ //! 
use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; +use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value}; -use crate::tenant::block_io::BlockReader; +use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::storage_layer::ValueReconstructResult; use crate::tenant::timeline::GetVectoredError; use crate::tenant::{PageReconstructError, Timeline}; -use crate::{page_cache, walrecord}; +use crate::{l0_flush, page_cache, walrecord}; use anyhow::{anyhow, ensure, Result}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; @@ -410,6 +411,7 @@ impl InMemoryLayer { continue; } + // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183 let buf = reader.read_blob(block_read.block_offset, &ctx).await; if let Err(e) = buf { reconstruct_state @@ -620,6 +622,13 @@ impl InMemoryLayer { // rare though, so we just accept the potential latency hit for now. let inner = self.inner.read().await; + let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone(); + use l0_flush::Inner; + let _concurrency_permit = match &*l0_flush_global_state { + Inner::PageCached => None, + Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await), + }; + let end_lsn = *self.end_lsn.get().unwrap(); let key_count = if let Some(key_range) = key_range { @@ -645,28 +654,83 @@ impl InMemoryLayer { ) .await?; - let mut buf = Vec::new(); - - let cursor = inner.file.block_cursor(); - - let ctx = RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::InMemoryLayer) - .build(); - for (key, vec_map) in inner.index.iter() { - // Write all page versions - for (lsn, pos) in vec_map.as_slice() { - cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; - let will_init = Value::des(&buf)?.will_init(); - let res; - (buf, res) = delta_layer_writer - .put_value_bytes(*key, *lsn, buf, will_init, &ctx) - .await; - res?; + match &*l0_flush_global_state { + l0_flush::Inner::PageCached => { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::InMemoryLayer) + .build(); + + let mut buf = Vec::new(); + + let cursor = inner.file.block_cursor(); + + for (key, vec_map) in inner.index.iter() { + // Write all page versions + for (lsn, pos) in vec_map.as_slice() { + cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; + let will_init = Value::des(&buf)?.will_init(); + let res; + (buf, res) = delta_layer_writer + .put_value_bytes(*key, *lsn, buf, will_init, &ctx) + .await; + res?; + } + } + } + l0_flush::Inner::Direct { .. } => { + let file_contents: Vec = inner.file.load_to_vec(ctx).await?; + assert_eq!( + file_contents.len() % PAGE_SZ, + 0, + "needed by BlockReaderRef::Slice" + ); + assert_eq!(file_contents.len(), { + let written = usize::try_from(inner.file.len()).unwrap(); + if written % PAGE_SZ == 0 { + written + } else { + written.checked_add(PAGE_SZ - (written % PAGE_SZ)).unwrap() + } + }); + + let cursor = BlockCursor::new(BlockReaderRef::Slice(&file_contents)); + + let mut buf = Vec::new(); + + for (key, vec_map) in inner.index.iter() { + // Write all page versions + for (lsn, pos) in vec_map.as_slice() { + // TODO: once we have blob lengths in the in-memory index, we can + // 1. get rid of the blob_io / BlockReaderRef::Slice business and + // 2. load the file contents into a Bytes and + // 3. the use `Bytes::slice` to get the `buf` that is our blob + // 4. 
pass that `buf` into `put_value_bytes`
+                            // => https://github.com/neondatabase/neon/issues/8183
+                            cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
+                            let will_init = Value::des(&buf)?.will_init();
+                            let res;
+                            (buf, res) = delta_layer_writer
+                                .put_value_bytes(*key, *lsn, buf, will_init, ctx)
+                                .await;
+                            res?;
+                        }
+                    }
+                }
+            }
         }

         // MAX is used here because we identify L0 layers by full key range
-        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, &ctx).await?;
+        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?;
+
+        // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()`.
+        //
+        // If we didn't and our caller drops this future, tokio-epoll-uring would extend the lifetime of
+        // the `file_contents: Vec<u8>` until the IO is done, but not the permit's lifetime.
+        // Thus, we'd have more concurrent `Vec<u8>` in existence than the semaphore allows.
+        //
+        // We hold across the fsync so that on ext4 mounted with data=ordered, all the kernel page cache pages
+        // we dirtied when writing to the filesystem have been flushed and marked !dirty.
+        drop(_concurrency_permit);
+
         Ok(Some(delta_layer))
     }
 }
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 5dd947253578..02069c29d264 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1096,19 +1096,10 @@ impl LayerInner {
         match rx.await {
             Ok(Ok(res)) => Ok(res),
-            Ok(Err(e)) => {
-                // sleep already happened in the spawned task, if it was not cancelled
-                match e.downcast_ref::<remote_storage::DownloadError>() {
-                    // If the download failed due to its cancellation token,
-                    // propagate the cancellation error upstream.
-                    Some(remote_storage::DownloadError::Cancelled) => {
-                        Err(DownloadError::DownloadCancelled)
-                    }
-                    // FIXME: this is not embedding the error because historically it would had
-                    // been output to compute, however that is no longer the case.
- _ => Err(DownloadError::DownloadFailed), - } + Ok(Err(remote_storage::DownloadError::Cancelled)) => { + Err(DownloadError::DownloadCancelled) } + Ok(Err(_)) => Err(DownloadError::DownloadFailed), Err(_gone) => Err(DownloadError::DownloadCancelled), } } @@ -1118,7 +1109,7 @@ impl LayerInner { timeline: Arc, permit: heavier_once_cell::InitPermit, ctx: &RequestContext, - ) -> anyhow::Result> { + ) -> Result, remote_storage::DownloadError> { let result = timeline .remote_client .download_layer_file( diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs new file mode 100644 index 000000000000..36386c87c999 --- /dev/null +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -0,0 +1,412 @@ +use std::{ + cmp::Ordering, + collections::{binary_heap, BinaryHeap}, +}; + +use pageserver_api::key::Key; +use utils::lsn::Lsn; + +use crate::{context::RequestContext, repository::Value}; + +use super::{ + delta_layer::{DeltaLayerInner, DeltaLayerIterator}, + image_layer::{ImageLayerInner, ImageLayerIterator}, +}; + +#[derive(Clone, Copy)] +enum LayerRef<'a> { + Image(&'a ImageLayerInner), + Delta(&'a DeltaLayerInner), +} + +impl<'a> LayerRef<'a> { + fn iter(self, ctx: &'a RequestContext) -> LayerIterRef<'a> { + match self { + Self::Image(x) => LayerIterRef::Image(x.iter(ctx)), + Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)), + } + } +} + +enum LayerIterRef<'a> { + Image(ImageLayerIterator<'a>), + Delta(DeltaLayerIterator<'a>), +} + +impl LayerIterRef<'_> { + async fn next(&mut self) -> anyhow::Result> { + match self { + Self::Delta(x) => x.next().await, + Self::Image(x) => x.next().await, + } + } +} + +/// This type plays several roles at once +/// 1. Unified iterator for image and delta layers. +/// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge). +/// 3. Lazy creation of the real delta/image iterator. +enum IteratorWrapper<'a> { + NotLoaded { + ctx: &'a RequestContext, + first_key_lower_bound: (Key, Lsn), + layer: LayerRef<'a>, + }, + Loaded { + iter: PeekableLayerIterRef<'a>, + }, +} + +struct PeekableLayerIterRef<'a> { + iter: LayerIterRef<'a>, + peeked: Option<(Key, Lsn, Value)>, // None == end +} + +impl<'a> PeekableLayerIterRef<'a> { + async fn create(mut iter: LayerIterRef<'a>) -> anyhow::Result { + let peeked = iter.next().await?; + Ok(Self { iter, peeked }) + } + + fn peek(&self) -> &Option<(Key, Lsn, Value)> { + &self.peeked + } + + async fn next(&mut self) -> anyhow::Result> { + let result = self.peeked.take(); + self.peeked = self.iter.next().await?; + Ok(result) + } +} + +impl<'a> std::cmp::PartialEq for IteratorWrapper<'a> { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == Ordering::Equal + } +} + +impl<'a> std::cmp::Eq for IteratorWrapper<'a> {} + +impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl<'a> std::cmp::Ord for IteratorWrapper<'a> { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + use std::cmp::Ordering; + let a = self.peek_next_key_lsn(); + let b = other.peek_next_key_lsn(); + match (a, b) { + (Some((k1, l1)), Some((k2, l2))) => { + let loaded_1 = if self.is_loaded() { 1 } else { 0 }; + let loaded_2 = if other.is_loaded() { 1 } else { 0 }; + // When key_lsn are the same, the unloaded iter will always appear before the loaded one. + // And note that we do a reverse at the end of the comparison, so it works with the max heap. 
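+                // Keeping the unloaded iterator first is what lets `MergeIterator::next` load it
+                // (its key is only a lower bound) and re-order the heap before emitting any value
+                // at that key_lsn.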
+ (k1, l1, loaded_1).cmp(&(k2, l2, loaded_2)) + } + (Some(_), None) => Ordering::Less, + (None, Some(_)) => Ordering::Greater, + (None, None) => Ordering::Equal, + } + .reverse() + } +} + +impl<'a> IteratorWrapper<'a> { + pub fn create_from_image_layer( + image_layer: &'a ImageLayerInner, + ctx: &'a RequestContext, + ) -> Self { + Self::NotLoaded { + layer: LayerRef::Image(image_layer), + first_key_lower_bound: (image_layer.key_range().start, image_layer.lsn()), + ctx, + } + } + + pub fn create_from_delta_layer( + delta_layer: &'a DeltaLayerInner, + ctx: &'a RequestContext, + ) -> Self { + Self::NotLoaded { + layer: LayerRef::Delta(delta_layer), + first_key_lower_bound: (delta_layer.key_range().start, delta_layer.lsn_range().start), + ctx, + } + } + + fn peek_next_key_lsn(&self) -> Option<(&Key, Lsn)> { + match self { + Self::Loaded { iter } => iter.peek().as_ref().map(|(key, lsn, _)| (key, *lsn)), + Self::NotLoaded { + first_key_lower_bound: (key, lsn), + .. + } => Some((key, *lsn)), + } + } + + // CORRECTNESS: this function must always take `&mut self`, never `&self`. + // + // The reason is that `impl Ord for Self` evaluates differently after this function + // returns. We're called through a `PeekMut::deref_mut`, which causes heap repair when + // the PeekMut gets returned. So, it's critical that we actually run through `PeekMut::deref_mut` + // and not just `PeekMut::deref` + // If we don't take `&mut self` + async fn load(&mut self) -> anyhow::Result<()> { + assert!(!self.is_loaded()); + let Self::NotLoaded { + ctx, + first_key_lower_bound, + layer, + } = self + else { + unreachable!() + }; + let iter = layer.iter(ctx); + let iter = PeekableLayerIterRef::create(iter).await?; + if let Some((k1, l1, _)) = iter.peek() { + let (k2, l2) = first_key_lower_bound; + debug_assert!((k1, l1) >= (k2, l2)); + } + *self = Self::Loaded { iter }; + Ok(()) + } + + fn is_loaded(&self) -> bool { + matches!(self, Self::Loaded { .. }) + } + + /// Correctness: must load the iterator before using. + /// + /// Given this iterator wrapper is private to the merge iterator, users won't be able to mis-use it. + /// The public interfaces to use are [`crate::tenant::storage_layer::delta_layer::DeltaLayerIterator`] and + /// [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. + async fn next(&mut self) -> anyhow::Result> { + let Self::Loaded { iter } = self else { + panic!("must load the iterator before using") + }; + iter.next().await + } +} + +pub struct MergeIterator<'a> { + heap: BinaryHeap>, +} + +impl<'a> MergeIterator<'a> { + pub fn create( + deltas: &[&'a DeltaLayerInner], + images: &[&'a ImageLayerInner], + ctx: &'a RequestContext, + ) -> Self { + let mut heap = Vec::with_capacity(images.len() + deltas.len()); + for image in images { + heap.push(IteratorWrapper::create_from_image_layer(image, ctx)); + } + for delta in deltas { + heap.push(IteratorWrapper::create_from_delta_layer(delta, ctx)); + } + Self { + heap: BinaryHeap::from(heap), + } + } + + pub async fn next(&mut self) -> anyhow::Result> { + while let Some(mut iter) = self.heap.peek_mut() { + if !iter.is_loaded() { + // Once we load the iterator, we can know the real first key-value pair in the iterator. + // We put it back into the heap so that a potentially unloaded layer may have a key between + // [potential_first_key, loaded_first_key). + iter.load().await?; + continue; + } + let Some(item) = iter.next().await? else { + // If the iterator returns None, we pop this iterator. 
Actually, in the current implementation, + // we order None > Some, and all the rest of the iterators should return None. + binary_heap::PeekMut::pop(iter); + continue; + }; + return Ok(Some(item)); + } + Ok(None) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use itertools::Itertools; + use pageserver_api::key::Key; + use utils::lsn::Lsn; + + use crate::{ + tenant::{ + harness::{TenantHarness, TIMELINE_ID}, + storage_layer::delta_layer::test::{produce_delta_layer, sort_delta}, + }, + DEFAULT_PG_VERSION, + }; + + async fn assert_merge_iter_equal( + merge_iter: &mut MergeIterator<'_>, + expect: &[(Key, Lsn, Value)], + ) { + let mut expect_iter = expect.iter(); + loop { + let o1 = merge_iter.next().await.unwrap(); + let o2 = expect_iter.next(); + assert_eq!(o1.is_some(), o2.is_some()); + if o1.is_none() && o2.is_none() { + break; + } + let (k1, l1, v1) = o1.unwrap(); + let (k2, l2, v2) = o2.unwrap(); + assert_eq!(&k1, k2); + assert_eq!(l1, *l2); + assert_eq!(&v1, v2); + } + } + + #[tokio::test] + async fn merge_in_between() { + use crate::repository::Value; + use bytes::Bytes; + + let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + let test_deltas1 = vec![ + ( + get_key(0), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ( + get_key(5), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ]; + let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx) + .await + .unwrap(); + let test_deltas2 = vec![ + ( + get_key(3), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ( + get_key(4), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ]; + let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx) + .await + .unwrap(); + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + let mut expect = Vec::new(); + expect.extend(test_deltas1); + expect.extend(test_deltas2); + expect.sort_by(sort_delta); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + } + + #[tokio::test] + async fn delta_merge() { + use crate::repository::Value; + use bytes::Bytes; + + let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + const N: usize = 1000; + let test_deltas1 = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10), + Lsn(0x20 * ((idx as u64) % 10 + 1)), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx) + .await + .unwrap(); + let test_deltas2 = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10), + Lsn(0x20 * ((idx as u64) % 10 + 1) + 0x10), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer_2 = produce_delta_layer(&tenant, 
&tline, test_deltas2.clone(), &ctx) + .await + .unwrap(); + let test_deltas3 = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10 + N as u32), + Lsn(0x10 * ((idx as u64) % 10 + 1)), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx) + .await + .unwrap(); + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + resident_layer_3.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + let mut expect = Vec::new(); + expect.extend(test_deltas1); + expect.extend(test_deltas2); + expect.extend(test_deltas3); + expect.sort_by(sort_delta); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + + // TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge + } + + // TODO: image layer merge, delta+image mixed merge + // TODO: is it possible to have duplicated delta at same LSN now? we might need to test that +} diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1175b750179d..762e903bf85d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -14,6 +14,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use arc_swap::ArcSwap; use bytes::Bytes; use camino::Utf8Path; +use chrono::{DateTime, Utc}; use enumset::EnumSet; use fail::fail_point; use once_cell::sync::Lazy; @@ -65,13 +66,12 @@ use std::{ ops::{Deref, Range}, }; -use crate::metrics::GetKind; -use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS; use crate::{ aux_file::AuxFileSizeEstimator, tenant::{ layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, + storage_layer::PersistentLayerDesc, }, }; use crate::{ @@ -90,10 +90,15 @@ use crate::{ use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; +use crate::{ + l0_flush::{self, L0FlushGlobalState}, + metrics::GetKind, +}; use crate::{ metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, }; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; +use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey}; use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, virtual_file::{MaybeFatalIo, VirtualFile}, @@ -208,6 +213,7 @@ pub struct TimelineResources { pub timeline_get_throttle: Arc< crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, >, + pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } pub(crate) struct AuxFilesState { @@ -360,6 +366,7 @@ pub struct Timeline { repartition_threshold: u64, last_image_layer_creation_check_at: AtomicLsn, + last_image_layer_creation_check_instant: std::sync::Mutex>, /// Current logical size of the "datadir", at the last LSN. current_logical_size: LogicalSize, @@ -433,6 +440,8 @@ pub struct Timeline { /// in the future, add `extra_test_sparse_keyspace` if necessary. #[cfg(test)] pub(crate) extra_test_dense_keyspace: ArcSwap, + + pub(crate) l0_flush_global_state: L0FlushGlobalState, } pub struct WalReceiverInfo { @@ -457,6 +466,9 @@ pub(crate) struct GcInfo { /// Leases granted to particular LSNs. 
pub(crate) leases: BTreeMap, + + /// Whether our branch point is within our ancestor's PITR interval (for cost estimation) + pub(crate) within_ancestor_pitr: bool, } impl GcInfo { @@ -717,6 +729,9 @@ impl From for CompactionError { fn from(e: CreateImageLayersError) -> Self { match e { CreateImageLayersError::Cancelled => CompactionError::ShuttingDown, + CreateImageLayersError::Other(e) => { + CompactionError::Other(e.context("create image layers")) + } _ => CompactionError::Other(e.into()), } } @@ -845,6 +860,18 @@ impl Timeline { .map(|ancestor| ancestor.timeline_id) } + /// Get the bytes written since the PITR cutoff on this branch, and + /// whether this branch's ancestor_lsn is within its parent's PITR. + pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) { + let gc_info = self.gc_info.read().unwrap(); + let history = self + .get_last_record_lsn() + .checked_sub(gc_info.cutoffs.pitr) + .unwrap_or(Lsn(0)) + .0; + (history, gc_info.within_ancestor_pitr) + } + /// Lock and get timeline's GC cutoff pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { self.latest_gc_cutoff_lsn.read() @@ -996,6 +1023,7 @@ impl Timeline { } pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32; + pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0; /// Look up multiple page versions at a given LSN /// @@ -1228,7 +1256,7 @@ impl Timeline { let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME .for_get_kind(get_kind) .start_timer(); - self.get_vectored_reconstruct_data(keyspace, lsn, reconstruct_state, ctx) + self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx) .await?; get_data_timer.stop_and_record(); @@ -1258,11 +1286,25 @@ impl Timeline { // (this is a requirement, not a bug). Skip updating the metric in these cases // to avoid infinite results. if !results.is_empty() { + let avg = layers_visited as f64 / results.len() as f64; + if avg >= Self::VEC_GET_LAYERS_VISITED_WARN_THRESH { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + tracing::info!( + shard_id = %self.tenant_shard_id.shard_slug(), + lsn = %lsn, + "Vectored read for {} visited {} layers on average per key and {} in total. {}/{} pages were returned", + keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size()); + }); + } + // Note that this is an approximation. Tracking the exact number of layers visited // per key requires virtually unbounded memory usage and is inefficient // (i.e. segment tree tracking each range queried from a layer) - crate::metrics::VEC_READ_NUM_LAYERS_VISITED - .observe(layers_visited as f64 / results.len() as f64); + crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(avg); } Ok(results) @@ -1554,7 +1596,13 @@ impl Timeline { let existing_lease = occupied.get_mut(); if valid_until > existing_lease.valid_until { existing_lease.valid_until = valid_until; + let dt: DateTime = valid_until.into(); + info!("lease extended to {}", dt); + } else { + let dt: DateTime = existing_lease.valid_until.into(); + info!("existing lease covers greater length, valid until {}", dt); } + existing_lease.clone() } else { // Reject already GC-ed LSN (lsn < latest_gc_cutoff) @@ -1563,6 +1611,8 @@ impl Timeline { bail!("tried to request a page version that was garbage collected. 
requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); } + let dt: DateTime = valid_until.into(); + info!("lease created, valid until {}", dt); entry.or_insert(LsnLease { valid_until }).clone() } }; @@ -2339,6 +2389,7 @@ impl Timeline { )), repartition_threshold: 0, last_image_layer_creation_check_at: AtomicLsn::new(0), + last_image_layer_creation_check_instant: Mutex::new(None), last_received_wal: Mutex::new(None), rel_size_cache: RwLock::new(RelSizeCache { @@ -2376,6 +2427,8 @@ impl Timeline { #[cfg(test)] extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())), + + l0_flush_global_state: resources.l0_flush_global_state, }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; @@ -4417,6 +4470,58 @@ impl Timeline { } } + /// Predicate function which indicates whether we should check if new image layers + /// are required. Since checking if new image layers are required is expensive in + /// terms of CPU, we only do it in the following cases: + /// 1. If the timeline has ingested sufficient WAL to justify the cost + /// 2. If enough time has passed since the last check + /// 2.1. For large tenants, we wish to perform the check more often since they + /// suffer from the lack of image layers + /// 2.2. For small tenants (that can mostly fit in RAM), we use a much longer interval + fn should_check_if_image_layers_required(self: &Arc, lsn: Lsn) -> bool { + const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024; + + let last_checks_at = self.last_image_layer_creation_check_at.load(); + let distance = lsn + .checked_sub(last_checks_at) + .expect("Attempt to compact with LSN going backwards"); + let min_distance = + self.get_image_layer_creation_check_threshold() as u64 * self.get_checkpoint_distance(); + + let distance_based_decision = distance.0 >= min_distance; + + let mut time_based_decision = false; + let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap(); + if let CurrentLogicalSize::Exact(logical_size) = self.current_logical_size.current_size() { + let check_required_after = if Into::::into(&logical_size) >= LARGE_TENANT_THRESHOLD + { + self.get_checkpoint_timeout() + } else { + Duration::from_secs(3600 * 48) + }; + + time_based_decision = match *last_check_instant { + Some(last_check) => { + let elapsed = last_check.elapsed(); + elapsed >= check_required_after + } + None => true, + }; + } + + // Do the expensive delta layer counting only if this timeline has ingested sufficient + // WAL since the last check or a checkpoint timeout interval has elapsed since the last + // check. + let decision = distance_based_decision || time_based_decision; + + if decision { + self.last_image_layer_creation_check_at.store(lsn); + *last_check_instant = Some(Instant::now()); + } + + decision + } + #[tracing::instrument(skip_all, fields(%lsn, %mode))] async fn create_image_layers( self: &Arc, @@ -4439,22 +4544,7 @@ impl Timeline { // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. let mut start = Key::MIN; - let check_for_image_layers = { - let last_checks_at = self.last_image_layer_creation_check_at.load(); - let distance = lsn - .checked_sub(last_checks_at) - .expect("Attempt to compact with LSN going backwards"); - let min_distance = self.get_image_layer_creation_check_threshold() as u64 - * self.get_checkpoint_distance(); - - // Skip the expensive delta layer counting if this timeline has not ingested sufficient - // WAL since the last check. 
- distance.0 >= min_distance - }; - - if check_for_image_layers { - self.last_image_layer_creation_check_at.store(lsn); - } + let check_for_image_layers = self.should_check_if_image_layers_required(lsn); for partition in partitioning.parts.iter() { let img_range = start..partition.ranges.last().unwrap().end; @@ -4483,6 +4573,22 @@ impl Timeline { start = img_range.end; continue; } + } else if let ImageLayerCreationMode::Force = mode { + // When forced to create image layers, we might try and create them where they already + // exist. This mode is only used in tests/debug. + let layers = self.layers.read().await; + if layers.contains_key(&PersistentLayerKey { + key_range: img_range.clone(), + lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn), + is_delta: false, + }) { + tracing::info!( + "Skipping image layer at {lsn} {}..{}, already exists", + img_range.start, + img_range.end + ); + continue; + } } let image_layer_writer = ImageLayerWriter::new( @@ -4711,6 +4817,42 @@ impl DurationRecorder { } } +/// Descriptor for a delta layer used in testing infra. The start/end key/lsn range of the +/// delta layer might be different from the min/max key/lsn in the delta layer. Therefore, +/// the layer descriptor requires the user to provide the ranges, which should cover all +/// keys specified in the `data` field. +#[cfg(test)] +pub struct DeltaLayerTestDesc { + pub lsn_range: Range, + pub key_range: Range, + pub data: Vec<(Key, Lsn, Value)>, +} + +#[cfg(test)] +impl DeltaLayerTestDesc { + #[allow(dead_code)] + pub fn new(lsn_range: Range, key_range: Range, data: Vec<(Key, Lsn, Value)>) -> Self { + Self { + lsn_range, + key_range, + data, + } + } + + pub fn new_with_inferred_key_range( + lsn_range: Range, + data: Vec<(Key, Lsn, Value)>, + ) -> Self { + let key_min = data.iter().map(|(key, _, _)| key).min().unwrap(); + let key_max = data.iter().map(|(key, _, _)| key).max().unwrap(); + Self { + key_range: (*key_min)..(key_max.next()), + lsn_range, + data, + } + } +} + impl Timeline { async fn finish_compact_batch( self: &Arc, @@ -5481,12 +5623,12 @@ impl Timeline { } images.sort_unstable_by(|(ka, _), (kb, _)| ka.cmp(kb)); let min_key = *images.first().map(|(k, _)| k).unwrap(); - let max_key = images.last().map(|(k, _)| k).unwrap().next(); + let end_key = images.last().map(|(k, _)| k).unwrap().next(); let mut image_layer_writer = ImageLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, - &(min_key..max_key), + &(min_key..end_key), lsn, ctx, ) @@ -5511,37 +5653,65 @@ impl Timeline { #[cfg(test)] pub(super) async fn force_create_delta_layer( self: &Arc, - mut deltas: Vec<(Key, Lsn, Value)>, + mut deltas: DeltaLayerTestDesc, check_start_lsn: Option, ctx: &RequestContext, ) -> anyhow::Result<()> { let last_record_lsn = self.get_last_record_lsn(); - deltas.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb))); - let min_key = *deltas.first().map(|(k, _, _)| k).unwrap(); - let max_key = deltas.last().map(|(k, _, _)| k).unwrap().next(); - let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); - let max_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap(); + deltas + .data + .sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb))); + assert!(deltas.data.first().unwrap().0 >= deltas.key_range.start); + assert!(deltas.data.last().unwrap().0 < deltas.key_range.end); + for (_, lsn, _) in &deltas.data { + assert!(deltas.lsn_range.start <= *lsn && *lsn < deltas.lsn_range.end); + } assert!( - max_lsn <= last_record_lsn, - "advance last record 
lsn before inserting a layer, max_lsn={max_lsn}, last_record_lsn={last_record_lsn}" + deltas.lsn_range.end <= last_record_lsn, + "advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}", + deltas.lsn_range.end, + last_record_lsn ); - let end_lsn = Lsn(max_lsn.0 + 1); if let Some(check_start_lsn) = check_start_lsn { - assert!(min_lsn >= check_start_lsn); + assert!(deltas.lsn_range.start >= check_start_lsn); + } + // check if the delta layer does not violate the LSN invariant, the legacy compaction should always produce a batch of + // layers of the same start/end LSN, and so should the force inserted layer + { + /// Checks if a overlaps with b, assume a/b = [start, end). + pub fn overlaps_with(a: &Range, b: &Range) -> bool { + !(a.end <= b.start || b.end <= a.start) + } + + let guard = self.layers.read().await; + for layer in guard.layer_map().iter_historic_layers() { + if layer.is_delta() + && overlaps_with(&layer.lsn_range, &deltas.lsn_range) + && layer.lsn_range != deltas.lsn_range + { + // If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic + panic!( + "inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}", + deltas.lsn_range.start, deltas.lsn_range.end, layer.lsn_range.start, layer.lsn_range.end + ); + } + } } let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, - min_key, - min_lsn..end_lsn, + deltas.key_range.start, + deltas.lsn_range, ctx, ) .await?; - for (key, lsn, val) in deltas { + for (key, lsn, val) in deltas.data { delta_layer_writer.put_value(key, lsn, val, ctx).await?; } - let delta_layer = delta_layer_writer.finish(max_key, self, ctx).await?; + let delta_layer = delta_layer_writer + .finish(deltas.key_range.end, self, ctx) + .await?; { let mut guard = self.layers.write().await; diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 6d747d424dde..b0088f4ea228 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -272,6 +272,7 @@ impl DeleteTimelineFlow { TimelineResources { remote_client, timeline_get_throttle: tenant.timeline_get_throttle.clone(), + l0_flush_global_state: tenant.l0_flush_global_state.clone(), }, // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. 
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 948237e06a5e..a43ff873acb5 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -339,6 +339,10 @@ impl LayerManager { self.layer_fmgr.contains(layer) } + pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool { + self.layer_fmgr.contains_key(key) + } + pub(crate) fn all_persistent_layers(&self) -> Vec { self.layer_fmgr.0.keys().cloned().collect_vec() } @@ -363,6 +367,10 @@ impl LayerFileManager { .clone() } + fn contains_key(&self, key: &PersistentLayerKey) -> bool { + self.0.contains_key(key) + } + pub(crate) fn insert(&mut self, layer: T) { let present = self.0.insert(layer.layer_desc().key(), layer.clone()); if present.is_some() && cfg!(debug_assertions) { diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index c6ee6b90c4d1..a66900522af4 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument}; use super::TaskStateUpdate; use crate::{ context::RequestContext, - metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, + metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, task_mgr::TaskKind, task_mgr::WALRECEIVER_RUNTIME, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, @@ -208,14 +208,9 @@ pub(super) async fn handle_walreceiver_connection( .instrument(tracing::info_span!("poller")), ); - // Immediately increment the gauge, then create a job to decrement it on task exit. - // One of the pros of `defer!` is that this will *most probably* - // get called, even in presence of panics. - let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]); - gauge.inc(); - scopeguard::defer! { - gauge.dec(); - } + let _guard = LIVE_CONNECTIONS + .with_label_values(&["wal_receiver"]) + .guard(); let identify = identify_system(&replication_client).await?; info!("{identify:?}"); diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 1241a1390209..7ad8446e0411 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -20,6 +20,7 @@ use std::num::NonZeroUsize; use bytes::BytesMut; use pageserver_api::key::Key; +use tokio_epoll_uring::BoundedBuf; use utils::lsn::Lsn; use utils::vec_map::VecMap; @@ -316,8 +317,9 @@ impl<'a> VectoredBlobReader<'a> { ); let buf = self .file - .read_exact_at_n(buf, read.start, read.size(), ctx) - .await?; + .read_exact_at(buf.slice(0..read.size()), read.start, ctx) + .await? 
+ .into_inner(); let blobs_at = read.blobs_at.as_slice(); let start_offset = blobs_at.first().expect("VectoredRead is never empty").0; diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 04d9386fab92..51b0c420c346 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -13,7 +13,7 @@ use crate::context::RequestContext; use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; -use crate::page_cache::PageWriteGuard; +use crate::page_cache::{PageWriteGuard, PAGE_SZ}; use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; @@ -48,6 +48,7 @@ pub(crate) mod owned_buffers_io { //! but for the time being we're proving out the primitives in the neon.git repo //! for faster iteration. + pub(crate) mod slice; pub(crate) mod write; pub(crate) mod util { pub(crate) mod size_tracking_writer; @@ -143,16 +144,17 @@ struct SlotInner { /// Impl of [`tokio_epoll_uring::IoBuf`] and [`tokio_epoll_uring::IoBufMut`] for [`PageWriteGuard`]. struct PageWriteGuardBuf { page: PageWriteGuard<'static>, - init_up_to: usize, } // Safety: the [`PageWriteGuard`] gives us exclusive ownership of the page cache slot, // and the location remains stable even if [`Self`] or the [`PageWriteGuard`] is moved. +// Page cache pages are zero-initialized, so, wrt uninitialized memory we're good. +// (Page cache tracks separately whether the contents are valid, see `PageWriteGuard::mark_valid`.) unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuardBuf { fn stable_ptr(&self) -> *const u8 { self.page.as_ptr() } fn bytes_init(&self) -> usize { - self.init_up_to + self.page.len() } fn bytes_total(&self) -> usize { self.page.len() @@ -166,8 +168,8 @@ unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuardBuf { } unsafe fn set_init(&mut self, pos: usize) { + // There shouldn't really be any reason to call this API since bytes_init() == bytes_total(). assert!(pos <= self.page.len()); - self.init_up_to = pos; } } @@ -585,37 +587,37 @@ impl VirtualFile { Ok(self.pos) } - pub async fn read_exact_at( + /// Read the file contents in range `offset..(offset + slice.bytes_total())` into `slice[0..slice.bytes_total()]`. + /// + /// The returned `Slice` is equivalent to the input `slice`, i.e., it's the same view into the same buffer. 
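+    ///
+    /// The slice does not need to be initialized: `bytes_total()` determines how many bytes are
+    /// read, and on success the returned slice has that many bytes initialized.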
+ pub async fn read_exact_at( &self, - buf: B, + slice: Slice, offset: u64, ctx: &RequestContext, - ) -> Result + ) -> Result, Error> where - B: IoBufMut + Send, + Buf: IoBufMut + Send, { - let (buf, res) = read_exact_at_impl(buf, offset, None, |buf, offset| { - self.read_at(buf, offset, ctx) - }) - .await; - res.map(|()| buf) - } + let assert_we_return_original_bounds = if cfg!(debug_assertions) { + Some((slice.stable_ptr() as usize, slice.bytes_total())) + } else { + None + }; - pub async fn read_exact_at_n( - &self, - buf: B, - offset: u64, - count: usize, - ctx: &RequestContext, - ) -> Result - where - B: IoBufMut + Send, - { - let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| { - self.read_at(buf, offset, ctx) - }) - .await; - res.map(|()| buf) + let original_bounds = slice.bounds(); + let (buf, res) = + read_exact_at_impl(slice, offset, |buf, offset| self.read_at(buf, offset, ctx)).await; + let res = res.map(|_| buf.slice(original_bounds)); + + if let Some(original_bounds) = assert_we_return_original_bounds { + if let Ok(slice) = &res { + let returned_bounds = (slice.stable_ptr() as usize, slice.bytes_total()); + assert_eq!(original_bounds, returned_bounds); + } + } + + res } /// Like [`Self::read_exact_at`] but for [`PageWriteGuard`]. @@ -625,13 +627,11 @@ impl VirtualFile { offset: u64, ctx: &RequestContext, ) -> Result, Error> { - let buf = PageWriteGuardBuf { - page, - init_up_to: 0, - }; - let res = self.read_exact_at(buf, offset, ctx).await; - res.map(|PageWriteGuardBuf { page, .. }| page) - .map_err(|e| Error::new(ErrorKind::Other, e)) + let buf = PageWriteGuardBuf { page }.slice_full(); + debug_assert_eq!(buf.bytes_total(), PAGE_SZ); + self.read_exact_at(buf, offset, ctx) + .await + .map(|slice| slice.into_inner().page) } // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235 @@ -722,14 +722,14 @@ impl VirtualFile { (buf, Ok(n)) } - pub(crate) async fn read_at( + pub(crate) async fn read_at( &self, - buf: B, + buf: tokio_epoll_uring::Slice, offset: u64, _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ - ) -> (B, Result) + ) -> (tokio_epoll_uring::Slice, Result) where - B: tokio_epoll_uring::BoundedBufMut + Send, + Buf: tokio_epoll_uring::IoBufMut + Send, { let file_guard = match self.lock_file().await { Ok(file_guard) => file_guard, @@ -781,26 +781,16 @@ impl VirtualFile { } // Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 -pub async fn read_exact_at_impl( - buf: B, +pub async fn read_exact_at_impl( + mut buf: tokio_epoll_uring::Slice, mut offset: u64, - count: Option, mut read_at: F, -) -> (B, std::io::Result<()>) +) -> (Buf, std::io::Result<()>) where - B: IoBufMut + Send, - F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut, - Fut: std::future::Future, std::io::Result)>, + Buf: IoBufMut + Send, + F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut, + Fut: std::future::Future, std::io::Result)>, { - let mut buf: tokio_epoll_uring::Slice = match count { - Some(count) => { - assert!(count <= buf.bytes_total()); - assert!(count > 0); - buf.slice(..count) // may include uninitialized memory - } - None => buf.slice_full(), // includes all the uninitialized memory - }; - while buf.bytes_total() != 0 { let res; (buf, res) = read_at(buf, offset).await; @@ -882,7 +872,7 @@ mod test_read_exact_at_impl { #[tokio::test] async fn test_basic() { - let buf = Vec::with_capacity(5); + let buf = Vec::with_capacity(5).slice_full(); let mock_read_at = 
Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::from(vec![Expectation { offset: 0, @@ -890,7 +880,7 @@ mod test_read_exact_at_impl { result: Ok(vec![b'a', b'b', b'c', b'd', b'e']), }]), })); - let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -899,33 +889,13 @@ mod test_read_exact_at_impl { assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']); } - #[tokio::test] - async fn test_with_count() { - let buf = Vec::with_capacity(5); - let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { - expectations: VecDeque::from(vec![Expectation { - offset: 0, - bytes_total: 3, - result: Ok(vec![b'a', b'b', b'c']), - }]), - })); - - let (buf, res) = read_exact_at_impl(buf, 0, Some(3), |buf, offset| { - let mock_read_at = Arc::clone(&mock_read_at); - async move { mock_read_at.lock().await.read_at(buf, offset).await } - }) - .await; - assert!(res.is_ok()); - assert_eq!(buf, vec![b'a', b'b', b'c']); - } - #[tokio::test] async fn test_empty_buf_issues_no_syscall() { - let buf = Vec::new(); + let buf = Vec::new().slice_full(); let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::new(), })); - let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -935,7 +905,7 @@ mod test_read_exact_at_impl { #[tokio::test] async fn test_two_read_at_calls_needed_until_buf_filled() { - let buf = Vec::with_capacity(4); + let buf = Vec::with_capacity(4).slice_full(); let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::from(vec![ Expectation { @@ -950,7 +920,7 @@ mod test_read_exact_at_impl { }, ]), })); - let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -961,7 +931,7 @@ mod test_read_exact_at_impl { #[tokio::test] async fn test_eof_before_buffer_full() { - let buf = Vec::with_capacity(3); + let buf = Vec::with_capacity(3).slice_full(); let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::from(vec![ Expectation { @@ -981,7 +951,7 @@ mod test_read_exact_at_impl { }, ]), })); - let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -1051,27 +1021,29 @@ impl VirtualFile { ctx: &RequestContext, ) -> Result, std::io::Error> { use crate::page_cache::PAGE_SZ; - let buf = vec![0; PAGE_SZ]; - let buf = self - .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64), ctx) + let slice = Vec::with_capacity(PAGE_SZ).slice_full(); + assert_eq!(slice.bytes_total(), PAGE_SZ); + let slice = self + .read_exact_at(slice, blknum as u64 * (PAGE_SZ as u64), ctx) .await?; - Ok(crate::tenant::block_io::BlockLease::Vec(buf)) + Ok(crate::tenant::block_io::BlockLease::Vec(slice.into_inner())) } async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { let mut tmp = vec![0; 128]; loop { - let res; - (tmp, res) = 
self.read_at(tmp, self.pos, ctx).await; + let slice = tmp.slice(..128); + let (slice, res) = self.read_at(slice, self.pos, ctx).await; match res { Ok(0) => return Ok(()), Ok(n) => { self.pos += n as u64; - buf.extend_from_slice(&tmp[..n]); + buf.extend_from_slice(&slice[..n]); } Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} Err(e) => return Err(e), } + tmp = slice.into_inner(); } } } @@ -1185,6 +1157,7 @@ mod tests { use crate::task_mgr::TaskKind; use super::*; + use owned_buffers_io::slice::SliceExt; use rand::seq::SliceRandom; use rand::thread_rng; use rand::Rng; @@ -1206,13 +1179,16 @@ mod tests { impl MaybeVirtualFile { async fn read_exact_at( &self, - mut buf: Vec, + mut slice: tokio_epoll_uring::Slice>, offset: u64, ctx: &RequestContext, - ) -> Result, Error> { + ) -> Result>, Error> { match self { - MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset, ctx).await, - MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf), + MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(slice, offset, ctx).await, + MaybeVirtualFile::File(file) => { + let rust_slice: &mut [u8] = slice.as_mut_rust_slice_full_zeroed(); + file.read_exact_at(rust_slice, offset).map(|()| slice) + } } } async fn write_all_at, Buf: IoBuf + Send>( @@ -1286,9 +1262,12 @@ mod tests { len: usize, ctx: &RequestContext, ) -> Result { - let buf = vec![0; len]; - let buf = self.read_exact_at(buf, pos, ctx).await?; - Ok(String::from_utf8(buf).unwrap()) + let slice = Vec::with_capacity(len).slice_full(); + assert_eq!(slice.bytes_total(), len); + let slice = self.read_exact_at(slice, pos, ctx).await?; + let vec = slice.into_inner(); + assert_eq!(vec.len(), len); + Ok(String::from_utf8(vec).unwrap()) } } @@ -1507,7 +1486,11 @@ mod tests { let mut rng = rand::rngs::OsRng; for _ in 1..1000 { let f = &files[rng.gen_range(0..files.len())]; - buf = f.read_exact_at(buf, 0, &ctx).await.unwrap(); + buf = f + .read_exact_at(buf.slice_full(), 0, &ctx) + .await + .unwrap() + .into_inner(); assert!(buf == SAMPLE); } }); diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 7a27be2ca108..2820cea097d1 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -107,7 +107,7 @@ use std::{ sync::atomic::{AtomicU8, Ordering}, }; -use super::{FileGuard, Metadata}; +use super::{owned_buffers_io::slice::SliceExt, FileGuard, Metadata}; #[cfg(target_os = "linux")] fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { @@ -120,38 +120,29 @@ fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std: } impl IoEngine { - pub(super) async fn read_at( + pub(super) async fn read_at( &self, file_guard: FileGuard, offset: u64, - mut buf: B, - ) -> ((FileGuard, B), std::io::Result) + mut slice: tokio_epoll_uring::Slice, + ) -> ( + (FileGuard, tokio_epoll_uring::Slice), + std::io::Result, + ) where - B: tokio_epoll_uring::BoundedBufMut + Send, + Buf: tokio_epoll_uring::IoBufMut + Send, { match self { IoEngine::NotSet => panic!("not initialized"), IoEngine::StdFs => { - // SAFETY: `dst` only lives at most as long as this match arm, during which buf remains valid memory. 
- let dst = unsafe { - std::slice::from_raw_parts_mut(buf.stable_mut_ptr(), buf.bytes_total()) - }; - let res = file_guard.with_std_file(|std_file| std_file.read_at(dst, offset)); - if let Ok(nbytes) = &res { - assert!(*nbytes <= buf.bytes_total()); - // SAFETY: see above assertion - unsafe { - buf.set_init(*nbytes); - } - } - #[allow(dropping_references)] - drop(dst); - ((file_guard, buf), res) + let rust_slice = slice.as_mut_rust_slice_full_zeroed(); + let res = file_guard.with_std_file(|std_file| std_file.read_at(rust_slice, offset)); + ((file_guard, slice), res) } #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.read(file_guard, offset, buf).await; + let (resources, res) = system.read(file_guard, offset, slice).await; (resources, res.map_err(epoll_uring_error_to_std)) } } diff --git a/pageserver/src/virtual_file/owned_buffers_io/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/slice.rs new file mode 100644 index 000000000000..d19e5ddffefb --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/slice.rs @@ -0,0 +1,121 @@ +use tokio_epoll_uring::BoundedBuf; +use tokio_epoll_uring::BoundedBufMut; +use tokio_epoll_uring::IoBufMut; +use tokio_epoll_uring::Slice; + +pub(crate) trait SliceExt { + /// Get a `&mut[0..self.bytes_total()`] slice, for when you need to do borrow-based IO. + /// + /// See the test case `test_slice_full_zeroed` for the difference to just doing `&slice[..]` + fn as_mut_rust_slice_full_zeroed(&mut self) -> &mut [u8]; +} + +impl SliceExt for Slice +where + B: IoBufMut, +{ + #[inline(always)] + fn as_mut_rust_slice_full_zeroed(&mut self) -> &mut [u8] { + // zero-initialize the uninitialized parts of the buffer so we can create a Rust slice + // + // SAFETY: we own `slice`, don't write outside the bounds + unsafe { + let to_init = self.bytes_total() - self.bytes_init(); + self.stable_mut_ptr() + .add(self.bytes_init()) + .write_bytes(0, to_init); + self.set_init(self.bytes_total()); + }; + let bytes_total = self.bytes_total(); + &mut self[0..bytes_total] + } +} + +#[cfg(test)] +mod tests { + use std::io::Read; + + use super::*; + use bytes::Buf; + use tokio_epoll_uring::Slice; + + #[test] + fn test_slice_full_zeroed() { + let make_fake_file = || bytes::BytesMut::from(&b"12345"[..]).reader(); + + // before we start the test, let's make sure we have a shared understanding of what slice_full does + { + let buf = Vec::with_capacity(3); + let slice: Slice<_> = buf.slice_full(); + assert_eq!(slice.bytes_init(), 0); + assert_eq!(slice.bytes_total(), 3); + let rust_slice = &slice[..]; + assert_eq!( + rust_slice.len(), + 0, + "Slice only derefs to a &[u8] of the initialized part" + ); + } + + // and also let's establish a shared understanding of .slice() + { + let buf = Vec::with_capacity(3); + let slice: Slice<_> = buf.slice(0..2); + assert_eq!(slice.bytes_init(), 0); + assert_eq!(slice.bytes_total(), 2); + let rust_slice = &slice[..]; + assert_eq!( + rust_slice.len(), + 0, + "Slice only derefs to a &[u8] of the initialized part" + ); + } + + // the above leads to the easy mistake of using slice[..] for borrow-based IO like so: + { + let buf = Vec::with_capacity(3); + let mut slice: Slice<_> = buf.slice_full(); + assert_eq!(slice[..].len(), 0); + let mut file = make_fake_file(); + file.read_exact(&mut slice[..]).unwrap(); // one might think this reads 3 bytes but it reads 0 + assert_eq!(&slice[..] as &[u8], &[][..] 
as &[u8]); + } + + // With owned buffers IO like with VirtualFilem, you could totally + // pass in a `Slice` with bytes_init()=0 but bytes_total()=5 + // and it will read 5 bytes into the slice, and return a slice that has bytes_init()=5. + { + // TODO: demo + } + + // + // Ok, now that we have a shared understanding let's demo how to use the extension trait. + // + + // slice_full() + { + let buf = Vec::with_capacity(3); + let mut slice: Slice<_> = buf.slice_full(); + let rust_slice = slice.as_mut_rust_slice_full_zeroed(); + assert_eq!(rust_slice.len(), 3); + assert_eq!(rust_slice, &[0, 0, 0]); + let mut file = make_fake_file(); + file.read_exact(rust_slice).unwrap(); + assert_eq!(rust_slice, b"123"); + assert_eq!(&slice[..], b"123"); + } + + // .slice(..) + { + let buf = Vec::with_capacity(3); + let mut slice: Slice<_> = buf.slice(0..2); + let rust_slice = slice.as_mut_rust_slice_full_zeroed(); + assert_eq!(rust_slice.len(), 2); + assert_eq!(rust_slice, &[0, 0]); + let mut file = make_fake_file(); + file.read_exact(rust_slice).unwrap(); + assert_eq!(rust_slice, b"12"); + assert_eq!(&slice[..], b"12"); + } + } +} diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 4f26f2f6d1f5..07c90385e654 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -343,7 +343,33 @@ impl WalIngest { xlog_checkpoint.oldestActiveXid, self.checkpoint.oldestActiveXid ); - self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid; + + // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`, + // because at shutdown, all in-progress transactions will implicitly + // end. Postgres startup code knows that, and allows hot standby to start + // immediately from a shutdown checkpoint. + // + // In Neon, Postgres hot standby startup always behaves as if starting from + // an online checkpoint. It needs a valid `oldestActiveXid` value, so + // instead of overwriting self.checkpoint.oldestActiveXid with + // InvalidTransactionid from the checkpoint WAL record, update it to a + // proper value, knowing that there are no in-progress transactions at this + // point, except for prepared transactions. + // + // See also the neon code changes in the InitWalRecovery() function. + if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID + && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN + { + let mut oldest_active_xid = self.checkpoint.nextXid.value as u32; + for xid in modification.tline.list_twophase_files(lsn, ctx).await? { + if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 { + oldest_active_xid = xid; + } + } + self.checkpoint.oldestActiveXid = oldest_active_xid; + } else { + self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid; + } // Write a new checkpoint key-value pair on every checkpoint record, even // if nothing really changed. Not strictly required, but it seems nice to @@ -375,6 +401,7 @@ impl WalIngest { if info == pg_constants::XLOG_RUNNING_XACTS { let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf); self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid; + self.checkpoint_modified = true; } } pg_constants::RM_REPLORIGIN_ID => { @@ -1277,13 +1304,10 @@ impl WalIngest { xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db ); - // Here we treat oldestXid and oldestXidDB - // differently from postgres redo routines. - // In postgres checkpoint.oldestXid lags behind xlrec.oldest_xid - // until checkpoint happens and updates the value. - // Here we can use the most recent value. 
- // It's just an optimization, though and can be deleted. - // TODO Figure out if there will be any issues with replica. + // In Postgres, oldestXid and oldestXidDB are updated in memory when the CLOG is + // truncated, but a checkpoint record with the updated values isn't written until + // later. In Neon, a server can start at any LSN, not just on a checkpoint record, + // so we keep the oldestXid and oldestXidDB up-to-date. self.checkpoint.oldestXid = xlrec.oldest_xid; self.checkpoint.oldestXidDB = xlrec.oldest_xid_db; self.checkpoint_modified = true; @@ -1384,14 +1408,31 @@ impl WalIngest { // Note: The multixact members can wrap around, even within one WAL record. offset = offset.wrapping_add(n_this_page as u32); } - if xlrec.mid >= self.checkpoint.nextMulti { - self.checkpoint.nextMulti = xlrec.mid + 1; - self.checkpoint_modified = true; - } - if xlrec.moff + xlrec.nmembers > self.checkpoint.nextMultiOffset { - self.checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers; + let next_offset = offset; + assert!(xlrec.moff.wrapping_add(xlrec.nmembers) == next_offset); + + // Update next-multi-xid and next-offset + // + // NB: In PostgreSQL, the next-multi-xid stored in the control file is allowed to + // go to 0, and it's fixed up by skipping to FirstMultiXactId in functions that + // read it, like GetNewMultiXactId(). This is different from how nextXid is + // incremented! nextXid skips over < FirstNormalTransactionId when the the value + // is stored, so it's never 0 in a checkpoint. + // + // I don't know why it's done that way, it seems less error-prone to skip over 0 + // when the value is stored rather than when it's read. But let's do it the same + // way here. + let next_multi_xid = xlrec.mid.wrapping_add(1); + + if self + .checkpoint + .update_next_multixid(next_multi_xid, next_offset) + { self.checkpoint_modified = true; } + + // Also update the next-xid with the highest member. According to the comments in + // multixact_redo(), this shouldn't be necessary, but let's do the same here. let max_mbr_xid = xlrec.members.iter().fold(None, |acc, mbr| { if let Some(max_xid) = acc { if mbr.xid.wrapping_sub(max_xid) as i32 > 0 { diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index d562540bde9b..5095beefd755 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -40,6 +40,7 @@ use std::time::Duration; use std::time::Instant; use tracing::*; use utils::lsn::Lsn; +use utils::sync::gate::GateError; use utils::sync::heavier_once_cell; /// @@ -53,10 +54,18 @@ pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, - /// The current [`process::WalRedoProcess`] that is used by new redo requests. - /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo - /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the + /// We use [`heavier_once_cell`] for + /// + /// 1. coalescing the lazy spawning of walredo processes ([`ProcessOnceCell::Spawned`]) + /// 2. prevent new processes from being spawned on [`Self::shutdown`] (=> [`ProcessOnceCell::ManagerShutDown`]). + /// + /// # Spawning + /// + /// Redo requests use the once cell to coalesce onto one call to [`process::WalRedoProcess::launch`]. + /// + /// Notably, requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the /// their process object; we use [`Arc::clone`] for that. 
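Aside: expressed with plain `std` types, the spawn-coalescing plus `Arc`-sharing that this comment describes looks roughly like the following. This is a simplified stand-in only; the real code uses the crate-internal `heavier_once_cell` together with a `Gate`, which additionally lets `shutdown()` mark the slot as permanently closed and wait for every launched process to be reaped.

```rust
use std::sync::{Arc, Mutex};

/// Simplified stand-in for the redo-process slot: concurrent requests race to
/// fill it, exactly one spawns, everyone else clones the Arc.
struct ProcessSlot<T> {
    slot: Mutex<Option<Arc<T>>>,
}

impl<T> ProcessSlot<T> {
    fn get_or_spawn(&self, spawn: impl FnOnce() -> T) -> Arc<T> {
        let mut guard = self.slot.lock().unwrap();
        match &*guard {
            Some(existing) => Arc::clone(existing),
            None => {
                let proc = Arc::new(spawn());
                *guard = Some(Arc::clone(&proc));
                proc
            }
        }
    }

    /// The first request to observe an error takes the process out of rotation;
    /// other requests may still hold (and use) their own Arc clones.
    fn take_if_same(&self, failed: &Arc<T>) {
        let mut guard = self.slot.lock().unwrap();
        let is_same = matches!(guard.as_ref(), Some(current) if Arc::ptr_eq(current, failed));
        if is_same {
            *guard = None;
        }
    }
}
```

In the real manager the slot value is an enum so that a `ManagerShutDown` marker can be left behind, and each spawned process also holds a gate guard so shutdown can wait for all of them to be killed and waited upon.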
+ /// /// This is primarily because earlier implementations that didn't use [`heavier_once_cell`] /// had that behavior; it's probably unnecessary. /// The only merit of it is that if one walredo process encounters an error, @@ -65,7 +74,63 @@ pub struct PostgresRedoManager { /// still be using the old redo process. But, those other tasks will most likely /// encounter an error as well, and errors are an unexpected condition anyway. /// So, probably we could get rid of the `Arc` in the future. - redo_process: heavier_once_cell::OnceCell>, + /// + /// # Shutdown + /// + /// See [`Self::launched_processes`]. + redo_process: heavier_once_cell::OnceCell, + + /// Gate that is entered when launching a walredo process and held open + /// until the process has been `kill()`ed and `wait()`ed upon. + /// + /// Manager shutdown waits for this gate to close after setting the + /// [`ProcessOnceCell::ManagerShutDown`] state in [`Self::redo_process`]. + /// + /// This type of usage is a bit unusual because gates usually keep track of + /// concurrent operations, e.g., every [`Self::request_redo`] that is inflight. + /// But we use it here to keep track of the _processes_ that we have launched, + /// which may outlive any individual redo request because + /// - we keep walredo process around until its quiesced to amortize spawn cost and + /// - the Arc may be held by multiple concurrent redo requests, so, just because + /// you replace the [`Self::redo_process`] cell's content doesn't mean the + /// process gets killed immediately. + /// + /// We could simplify this by getting rid of the [`Arc`]. + /// See the comment on [`Self::redo_process`] for more details. + launched_processes: utils::sync::gate::Gate, +} + +/// See [`PostgresRedoManager::redo_process`]. +enum ProcessOnceCell { + Spawned(Arc), + ManagerShutDown, +} + +struct Process { + _launched_processes_guard: utils::sync::gate::GateGuard, + process: process::WalRedoProcess, +} + +impl std::ops::Deref for Process { + type Target = process::WalRedoProcess; + + fn deref(&self) -> &Self::Target { + &self.process + } +} + +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("cancelled")] + Cancelled, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +macro_rules! bail { + ($($arg:tt)*) => { + return Err($crate::walredo::Error::Other(::anyhow::anyhow!($($arg)*))); + } } /// @@ -88,9 +153,9 @@ impl PostgresRedoManager { base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, pg_version: u32, - ) -> anyhow::Result { + ) -> Result { if records.is_empty() { - anyhow::bail!("invalid WAL redo request with no records"); + bail!("invalid WAL redo request with no records"); } let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID); @@ -148,10 +213,10 @@ impl PostgresRedoManager { chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?) }) }, - process: self - .redo_process - .get() - .map(|p| WalRedoManagerProcessStatus { pid: p.id() }), + process: self.redo_process.get().and_then(|p| match &*p { + ProcessOnceCell::Spawned(p) => Some(WalRedoManagerProcessStatus { pid: p.id() }), + ProcessOnceCell::ManagerShutDown => None, + }), } } } @@ -170,9 +235,39 @@ impl PostgresRedoManager { conf, last_redo_at: std::sync::Mutex::default(), redo_process: heavier_once_cell::OnceCell::default(), + launched_processes: utils::sync::gate::Gate::default(), } } + /// Shut down the WAL redo manager. 
+ /// + /// After this future completes + /// - no redo process is running + /// - no new redo process will be spawned + /// - redo requests that need walredo process will fail with [`Error::Cancelled`] + /// - [`apply_neon`]-only redo requests may still work, but this may change in the future + /// + /// # Cancel-Safety + /// + /// This method is cancellation-safe. + pub async fn shutdown(&self) { + // prevent new processes from being spawned + let permit = match self.redo_process.get_or_init_detached().await { + Ok(guard) => { + let (proc, permit) = guard.take_and_deinit(); + drop(proc); // this just drops the Arc, its refcount may not be zero yet + permit + } + Err(permit) => permit, + }; + self.redo_process + .set(ProcessOnceCell::ManagerShutDown, permit); + // wait for ongoing requests to drain and the refcounts of all Arc that + // we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s + // for the underlying process. + self.launched_processes.close().await; + } + /// This type doesn't have its own background task to check for idleness: we /// rely on our owner calling this function periodically in its own housekeeping /// loops. @@ -203,38 +298,48 @@ impl PostgresRedoManager { records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, pg_version: u32, - ) -> anyhow::Result { + ) -> Result { *(self.last_redo_at.lock().unwrap()) = Some(Instant::now()); let (rel, blknum) = key.to_rel_block().context("invalid record")?; const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { - let proc: Arc = - match self.redo_process.get_or_init_detached().await { - Ok(guard) => Arc::clone(&guard), - Err(permit) => { - // don't hold poison_guard, the launch code can bail - let start = Instant::now(); - let proc = Arc::new( - process::WalRedoProcess::launch( + let proc: Arc = match self.redo_process.get_or_init_detached().await { + Ok(guard) => match &*guard { + ProcessOnceCell::Spawned(proc) => Arc::clone(proc), + ProcessOnceCell::ManagerShutDown => { + return Err(Error::Cancelled); + } + }, + Err(permit) => { + let start = Instant::now(); + let proc = Arc::new(Process { + _launched_processes_guard: match self.launched_processes.enter() { + Ok(guard) => guard, + Err(GateError::GateClosed) => unreachable!( + "shutdown sets the once cell to `ManagerShutDown` state before closing the gate" + ), + }, + process: process::WalRedoProcess::launch( self.conf, self.tenant_shard_id, pg_version, ) .context("launch walredo process")?, - ); - let duration = start.elapsed(); - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); - info!( - duration_ms = duration.as_millis(), - pid = proc.id(), - "launched walredo process" - ); - self.redo_process.set(Arc::clone(&proc), permit); - proc - } - }; + }); + let duration = start.elapsed(); + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + duration_ms = duration.as_millis(), + pid = proc.id(), + "launched walredo process" + ); + self.redo_process + .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit); + proc + } + }; let started_at = std::time::Instant::now(); @@ -299,12 +404,17 @@ impl PostgresRedoManager { match self.redo_process.get() { None => (), Some(guard) => { - if Arc::ptr_eq(&proc, &*guard) { - // We're the first to observe an error from `proc`, it's our job to take it out of rotation. - guard.take_and_deinit(); - } else { - // Another task already spawned another redo process (further up in this method) - // and put it into `redo_process`. 
Do nothing, our view of the world is behind. + match &*guard { + ProcessOnceCell::ManagerShutDown => {} + ProcessOnceCell::Spawned(guard_proc) => { + if Arc::ptr_eq(&proc, guard_proc) { + // We're the first to observe an error from `proc`, it's our job to take it out of rotation. + guard.take_and_deinit(); + } else { + // Another task already spawned another redo process (further up in this method) + // and put it into `redo_process`. Do nothing, our view of the world is behind. + } + } } } } @@ -315,7 +425,7 @@ impl PostgresRedoManager { } n_attempts += 1; if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() { - return result; + return result.map_err(Error::Other); } } } @@ -329,7 +439,7 @@ impl PostgresRedoManager { lsn: Lsn, base_img: Option, records: &[(Lsn, NeonWalRecord)], - ) -> anyhow::Result { + ) -> Result { let start_time = Instant::now(); let mut page = BytesMut::new(); @@ -338,7 +448,7 @@ impl PostgresRedoManager { page.extend_from_slice(&fpi[..]); } else { // All the current WAL record types that we can handle require a base image. - anyhow::bail!("invalid neon WAL redo request with no base image"); + bail!("invalid neon WAL redo request with no base image"); } // Apply all the WAL records in the batch diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index cd316dbb9141..3b755bb0420c 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -6,6 +6,7 @@ OBJS = \ $(WIN32RES) \ extension_server.o \ file_cache.o \ + hll.o \ libpagestore.o \ neon.o \ neon_utils.o \ @@ -22,7 +23,7 @@ SHLIB_LINK_INTERNAL = $(libpq) SHLIB_LINK = -lcurl EXTENSION = neon -DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql +DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql PGFILEDESC = "neon - cloud storage for PostgreSQL" EXTRA_CLEAN = \ diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 25275ef31fe9..1894e8c72a5c 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -26,7 +26,6 @@ #include "miscadmin.h" #include "pagestore_client.h" #include "common/hashfn.h" -#include "lib/hyperloglog.h" #include "pgstat.h" #include "postmaster/bgworker.h" #include RELFILEINFO_HDR @@ -40,6 +39,8 @@ #include "utils/dynahash.h" #include "utils/guc.h" +#include "hll.h" + /* * Local file cache is used to temporary store relations pages in local file system. * All blocks of all relations are stored inside one file and addressed using shared hash map. 
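Aside: the `hll.h`/`hll.c` pair introduced further down replaces the stock `lib/hyperloglog.h` estimator with a sliding-window variant: instead of keeping one maximum rank per register, it stores a last-seen timestamp per (register, rank) pair, so the working-set cardinality can be estimated over an arbitrary trailing time window. A compact sketch of the idea, in Rust rather than C and with simplified integer timestamps (illustrative constants; bias corrections omitted):

```rust
const B: usize = 10;          // register-index bits (HLL_BIT_WIDTH)
const C: usize = 32 - B;      // rank bits (HLL_C_BITS)
const N_REGS: usize = 1 << B; // number of registers

/// regs[i][r] = last time rank `r` was observed for register `i`; 0 = never.
/// Ranks run 1..=C+1, hence C+2 slots per register in this sketch.
struct SlidingHll {
    regs: Vec<[u64; C + 2]>,
}

impl SlidingHll {
    fn new() -> Self {
        SlidingHll { regs: vec![[0u64; C + 2]; N_REGS] }
    }

    fn add(&mut self, hash: u32, now: u64) {
        let index = (hash >> C) as usize; // first B bits select the register
        // rank = position of the first set bit among the remaining C bits,
        // or C + 1 if they are all zero (same role as rho() in hll.c)
        let rank = ((hash << B).leading_zeros() as usize + 1).min(C + 1);
        self.regs[index][rank] = now;
    }

    /// Standard HyperLogLog estimate, but each register contributes the highest
    /// rank seen at or after `since` (must be >= 1, since 0 means "never").
    /// The small/large-range corrections of the real code are omitted here.
    fn estimate(&self, since: u64) -> f64 {
        let m = N_REGS as f64;
        let alpha_mm = (0.7213 / (1.0 + 1.079 / m)) * m * m;
        let sum: f64 = self
            .regs
            .iter()
            .map(|reg| {
                let r = (1..=C + 1).rev().find(|&r| reg[r] >= since).unwrap_or(0);
                2f64.powi(-(r as i32))
            })
            .sum();
        alpha_mm / sum
    }
}
```

With these parameters the real structure costs 2^B * (C + 1) * 8 bytes, about 184 KiB of shared memory, as the comment in `hll.h` notes.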
@@ -62,7 +63,6 @@ #define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */ #define MB ((uint64)1024*1024) -#define HYPER_LOG_LOG_BIT_WIDTH 10 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) typedef struct FileCacheEntry @@ -87,8 +87,7 @@ typedef struct FileCacheControl uint64 writes; dlist_head lru; /* double linked list for LRU replacement * algorithm */ - hyperLogLogState wss_estimation; /* estimation of wroking set size */ - uint8_t hyperloglog_hashes[(1 << HYPER_LOG_LOG_BIT_WIDTH) + 1]; + HyperLogLogState wss_estimation; /* estimation of working set size */ } FileCacheControl; static HTAB *lfc_hash; @@ -238,12 +237,7 @@ lfc_shmem_startup(void) dlist_init(&lfc_ctl->lru); /* Initialize hyper-log-log structure for estimating working set size */ - initHyperLogLog(&lfc_ctl->wss_estimation, HYPER_LOG_LOG_BIT_WIDTH); - - /* We need hashes in shared memory */ - pfree(lfc_ctl->wss_estimation.hashesArr); - memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes); - lfc_ctl->wss_estimation.hashesArr = lfc_ctl->hyperloglog_hashes; + initSHLL(&lfc_ctl->wss_estimation); /* Recreate file cache on restart */ fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); @@ -545,7 +539,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, /* Approximate working set */ tag.blockNum = blkno; - addHyperLogLog(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0) { @@ -986,20 +980,38 @@ local_cache_pages(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funcctx); } +PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds); + +Datum +approximate_working_set_size_seconds(PG_FUNCTION_ARGS) +{ + if (lfc_size_limit != 0) + { + int32 dc; + time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0); + LWLockAcquire(lfc_lock, LW_SHARED); + dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration); + LWLockRelease(lfc_lock); + PG_RETURN_INT32(dc); + } + PG_RETURN_NULL(); +} + PG_FUNCTION_INFO_V1(approximate_working_set_size); Datum approximate_working_set_size(PG_FUNCTION_ARGS) { - int32 dc = -1; if (lfc_size_limit != 0) { + int32 dc; bool reset = PG_GETARG_BOOL(0); LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED); - dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation); + dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1); if (reset) - memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes); + memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs); LWLockRelease(lfc_lock); + PG_RETURN_INT32(dc); } - PG_RETURN_INT32(dc); + PG_RETURN_NULL(); } diff --git a/pgxn/neon/hll.c b/pgxn/neon/hll.c new file mode 100644 index 000000000000..f8496b31259d --- /dev/null +++ b/pgxn/neon/hll.c @@ -0,0 +1,193 @@ +/*------------------------------------------------------------------------- + * + * hll.c + * Sliding HyperLogLog cardinality estimator + * + * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group + * + * Implements https://hal.science/hal-00465313/document + * + * Based on Hideaki Ohno's C++ implementation. 
This is probably not ideally + * suited to estimating the cardinality of very large sets; in particular, we + * have not attempted to further optimize the implementation as described in + * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic + * Engineering of a State of The Art Cardinality Estimation Algorithm". + * + * A sparse representation of HyperLogLog state is used, with fixed space + * overhead. + * + * The copyright terms of Ohno's original version (the MIT license) follow. + * + * IDENTIFICATION + * src/backend/lib/hyperloglog.c + * + *------------------------------------------------------------------------- + */ + +/* + * Copyright (c) 2013 Hideaki Ohno + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include + +#include "postgres.h" +#include "funcapi.h" +#include "port/pg_bitutils.h" +#include "utils/timestamp.h" +#include "hll.h" + + +#define POW_2_32 (4294967296.0) +#define NEG_POW_2_32 (-4294967296.0) + +#define ALPHA_MM ((0.7213 / (1.0 + 1.079 / HLL_N_REGISTERS)) * HLL_N_REGISTERS * HLL_N_REGISTERS) + +/* + * Worker for addHyperLogLog(). + * + * Calculates the position of the first set bit in first b bits of x argument + * starting from the first, reading from most significant to least significant + * bits. + * + * Example (when considering fist 10 bits of x): + * + * rho(x = 0b1000000000) returns 1 + * rho(x = 0b0010000000) returns 3 + * rho(x = 0b0000000000) returns b + 1 + * + * "The binary address determined by the first b bits of x" + * + * Return value "j" used to index bit pattern to watch. + */ +static inline uint8 +rho(uint32 x, uint8 b) +{ + uint8 j = 1; + + if (x == 0) + return b + 1; + + j = 32 - pg_leftmost_one_pos32(x); + + if (j > b) + return b + 1; + + return j; +} + +/* + * Initialize HyperLogLog track state + */ +void +initSHLL(HyperLogLogState *cState) +{ + memset(cState->regs, 0, sizeof(cState->regs)); +} + +/* + * Adds element to the estimator, from caller-supplied hash. + * + * It is critical that the hash value passed be an actual hash value, typically + * generated using hash_any(). The algorithm relies on a specific bit-pattern + * observable in conjunction with stochastic averaging. There must be a + * uniform distribution of bits in hash values for each distinct original value + * observed. 
+ */ +void +addSHLL(HyperLogLogState *cState, uint32 hash) +{ + uint8 count; + uint32 index; + size_t i; + size_t j; + + TimestampTz now = GetCurrentTimestamp(); + /* Use the first "k" (registerWidth) bits as a zero based index */ + index = hash >> HLL_C_BITS; + + /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ + count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS); + + cState->regs[index][count] = now; +} + +static uint8 +getMaximum(const TimestampTz* reg, TimestampTz since) +{ + uint8 max = 0; + + for (size_t i = 0; i < HLL_C_BITS + 1; i++) + { + if (reg[i] >= since) + { + max = i; + } + } + + return max; +} + + +/* + * Estimates cardinality, based on elements added so far + */ +double +estimateSHLL(HyperLogLogState *cState, time_t duration) +{ + double result; + double sum = 0.0; + size_t i; + uint8 R[HLL_N_REGISTERS]; + /* 0 indicates uninitialized timestamp, so if we need to cover the whole range than starts with 1 */ + TimestampTz since = duration == (time_t)-1 ? 1 : GetCurrentTimestamp() - duration * USECS_PER_SEC; + + for (i = 0; i < HLL_N_REGISTERS; i++) + { + R[i] = getMaximum(cState->regs[i], since); + sum += 1.0 / pow(2.0, R[i]); + } + + /* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */ + result = ALPHA_MM / sum; + + if (result <= (5.0 / 2.0) * HLL_N_REGISTERS) + { + /* Small range correction */ + int zero_count = 0; + + for (i = 0; i < HLL_N_REGISTERS; i++) + { + zero_count += R[i] == 0; + } + + if (zero_count != 0) + result = HLL_N_REGISTERS * log((double) HLL_N_REGISTERS / + zero_count); + } + else if (result > (1.0 / 30.0) * POW_2_32) + { + /* Large range correction */ + result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32)); + } + + return result; +} + diff --git a/pgxn/neon/hll.h b/pgxn/neon/hll.h new file mode 100644 index 000000000000..9256cb9afa2f --- /dev/null +++ b/pgxn/neon/hll.h @@ -0,0 +1,86 @@ +/*------------------------------------------------------------------------- + * + * hll.h + * Sliding HyperLogLog cardinality estimator + * + * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group + * + * Implements https://hal.science/hal-00465313/document + * + * Based on Hideaki Ohno's C++ implementation. This is probably not ideally + * suited to estimating the cardinality of very large sets; in particular, we + * have not attempted to further optimize the implementation as described in + * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic + * Engineering of a State of The Art Cardinality Estimation Algorithm". + * + * A sparse representation of HyperLogLog state is used, with fixed space + * overhead. + * + * The copyright terms of Ohno's original version (the MIT license) follow. + * + * IDENTIFICATION + * src/backend/lib/hyperloglog.c + * + *------------------------------------------------------------------------- + */ + +/* + * Copyright (c) 2013 Hideaki Ohno + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef HLL_H +#define HLL_H + +#define HLL_BIT_WIDTH 10 +#define HLL_C_BITS (32 - HLL_BIT_WIDTH) +#define HLL_N_REGISTERS (1 << HLL_BIT_WIDTH) + +/* + * HyperLogLog is an approximate technique for computing the number of distinct + * entries in a set. Importantly, it does this by using a fixed amount of + * memory. See the 2007 paper "HyperLogLog: the analysis of a near-optimal + * cardinality estimation algorithm" for more. + * + * Instead of a single counter for every bits register, we have a timestamp + * for every valid number of bits we can encounter. Every time we encounter + * a certain number of bits, we update the timestamp in those registers to + * the current timestamp. + * + * We can query the sketch's stored cardinality for the range of some timestamp + * up to now: For each register, we return the highest bits bucket that has a + * modified timestamp >= the query timestamp. This value is the number of bits + * for this register in the normal HLL calculation. + * + * The memory usage is 2^B * (C + 1) * sizeof(TimetampTz), or 184kiB. + * Usage could be halved if we decide to reduce the required time dimension + * precision; as 32 bits in second precision should be enough for statistics. + * However, that is not yet implemented. + */ +typedef struct HyperLogLogState +{ + TimestampTz regs[HLL_N_REGISTERS][HLL_C_BITS + 1]; +} HyperLogLogState; + +extern void initSHLL(HyperLogLogState *cState); +extern void addSHLL(HyperLogLogState *cState, uint32 hash); +extern double estimateSHLL(HyperLogLogState *cState, time_t dutration); + +#endif diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index a665cafafe71..73a001b6ba72 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -427,12 +427,17 @@ pageserver_connect(shardno_t shard_no, int elevel) values[n_pgsql_params] = NULL; shard->conn = PQconnectStartParams(keywords, values, 1); - if (!shard->conn) + if (PQstatus(shard->conn) == CONNECTION_BAD) { - neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory"); + char *msg = pchomp(PQerrorMessage(shard->conn)); + CLEANUP_AND_DISCONNECT(shard); + ereport(elevel, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), + errdetail_internal("%s", msg))); + pfree(msg); return false; } - shard->state = PS_Connecting_Startup; /* fallthrough */ } diff --git a/pgxn/neon/neon--1.3--1.4.sql b/pgxn/neon/neon--1.3--1.4.sql new file mode 100644 index 000000000000..042effe3461c --- /dev/null +++ b/pgxn/neon/neon--1.3--1.4.sql @@ -0,0 +1,9 @@ +\echo Use "ALTER EXTENSION neon UPDATE TO '1.4'" to load this file. 
\quit + +CREATE FUNCTION approximate_working_set_size_seconds(duration integer default null) +RETURNS integer +AS 'MODULE_PATHNAME', 'approximate_working_set_size_seconds' +LANGUAGE C PARALLEL SAFE; + +GRANT EXECUTE ON FUNCTION approximate_working_set_size_seconds(integer) TO pg_monitor; + diff --git a/pgxn/neon/neon--1.4--1.3.sql b/pgxn/neon/neon--1.4--1.3.sql new file mode 100644 index 000000000000..bea72d1a6b17 --- /dev/null +++ b/pgxn/neon/neon--1.4--1.3.sql @@ -0,0 +1 @@ +DROP FUNCTION IF EXISTS approximate_working_set_size_seconds(integer) CASCADE; diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index b6b2db7e71ad..e4968bdf8991 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -12,6 +12,8 @@ #include "fmgr.h" #include "miscadmin.h" +#include "access/subtrans.h" +#include "access/twophase.h" #include "access/xact.h" #include "access/xlog.h" #include "storage/buf_internals.h" @@ -22,10 +24,12 @@ #include "replication/logical.h" #include "replication/slot.h" #include "replication/walsender.h" +#include "storage/proc.h" #include "storage/procsignal.h" #include "tcop/tcopprot.h" #include "funcapi.h" #include "access/htup_details.h" +#include "utils/builtins.h" #include "utils/pg_lsn.h" #include "utils/guc.h" #include "utils/wait_event.h" @@ -266,6 +270,293 @@ LogicalSlotsMonitorMain(Datum main_arg) } } +/* + * XXX: These private to procarray.c, but we need them here. + */ +#define PROCARRAY_MAXPROCS (MaxBackends + max_prepared_xacts) +#define TOTAL_MAX_CACHED_SUBXIDS \ + ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS) + +/* + * Restore running-xact information by scanning the CLOG at startup. + * + * In PostgreSQL, a standby always has to wait for a running-xacts WAL record + * to arrive before it can start accepting queries. Furthermore, if there are + * transactions with too many subxids (> 64) open to fit in the in-memory + * subxids cache, the running-xacts record will be marked as "suboverflowed", + * and the standby will need to also wait for the currently in-progress + * transactions to finish. + * + * That's not great in PostgreSQL, because a hot standby does not necessary + * open up for queries immediately as you might expect. But it's worse in + * Neon: A standby in Neon doesn't need to start WAL replay from a checkpoint + * record; it can start at any LSN. Postgres arranges things so that there is + * a running-xacts record soon after every checkpoint record, but when you + * start from an arbitrary LSN, that doesn't help. If the primary is idle, or + * not running at all, it might never write a new running-xacts record, + * leaving the replica in a limbo where it can never start accepting queries. + * + * To mitigate that, we have an additional mechanism to find the running-xacts + * information: we scan the CLOG, making note of any XIDs not marked as + * committed or aborted. They are added to the Postgres known-assigned XIDs + * array by calling ProcArrayApplyRecoveryInfo() in the caller of this + * function. + * + * There is one big limitation with that mechanism: The size of the + * known-assigned XIDs is limited, so if there are a lot of in-progress XIDs, + * we have to give up. Furthermore, we don't know how many of the in-progress + * XIDs are subtransactions, and if we use up all the space in the + * known-assigned XIDs array for subtransactions, we might run out of space in + * the array later during WAL replay, causing the replica to shut down with + * "ERROR: too many KnownAssignedXids". 
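Aside: the scan this comment goes on to describe has a simple core loop: walk the XID space from the oldest potentially running XID up to `nextXid`, classify each XID via the CLOG, keep the in-progress ones, and give up once the budget is exceeded. With the default `PGPROC_MAX_CACHED_SUBXIDS` of 64 and, say, `MaxBackends + max_prepared_xacts` = 100, that budget works out to (64 + 1) * 100 / 2 = 3250 XIDs. A hedged sketch of the loop (Rust pseudocode; `clog_status` stands in for `TransactionIdGetStatus`, and the merging of prepared transactions and their subtransactions is left out):

```rust
#[derive(PartialEq)]
enum XidStatus {
    InProgress,
    Committed,
    Aborted,
}

/// Sketch only: collect in-progress XIDs in [from, till), bailing out (None)
/// if more than `max_xcnt` are found, in which case the caller falls back to
/// waiting for a running-xacts WAL record as usual.
fn scan_clog_for_running_xacts(
    from: u32,
    till: u32,
    max_xcnt: usize,
    clog_status: impl Fn(u32) -> XidStatus,
) -> Option<Vec<u32>> {
    let mut running = Vec::new();
    let mut xid = from;
    while xid != till {
        if clog_status(xid) == XidStatus::InProgress {
            if running.len() >= max_xcnt {
                return None;
            }
            running.push(xid);
        }
        // advance with wraparound, skipping the special XIDs below
        // FirstNormalTransactionId (3), like TransactionIdAdvance() does
        xid = xid.wrapping_add(1);
        if xid < 3 {
            xid = 3;
        }
    }
    Some(running)
}
```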
The safe # of XIDs that we can add to + * the known-assigned array without risking that error later is very low, + * merely PGPROC_MAX_CACHED_SUBXIDS == 64, so we take our chances and use up + * to half of the known-assigned XIDs array for the subtransactions, even + * though that risks getting the error later. + * + * Note: It's OK if the recovered list of XIDs includes some transactions that + * have crashed in the primary, and hence will never commit. They will be seen + * as in-progress, until we see a new next running-acts record with an + * oldestActiveXid that invalidates them. That's how the known-assigned XIDs + * array always works. + * + * If scraping the CLOG doesn't succeed for some reason, like the subxid + * overflow, Postgres will fall back to waiting for a running-xacts record + * like usual. + * + * Returns true if a complete list of in-progress XIDs was scraped. + */ +static bool +RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *nxids) +{ + TransactionId from; + TransactionId till; + int max_xcnt; + TransactionId *prepared_xids = NULL; + int n_prepared_xids; + TransactionId *restored_xids = NULL; + int n_restored_xids; + int next_prepared_idx; + + Assert(*xids == NULL); + + /* + * If the checkpoint doesn't have a valid oldestActiveXid, bail out. We + * don't know where to start the scan. + * + * This shouldn't happen, because the pageserver always maintains a valid + * oldestActiveXid nowadays. Except when starting at an old point in time + * that was ingested before the pageserver was taught to do that. + */ + if (!TransactionIdIsValid(checkpoint->oldestActiveXid)) + { + elog(LOG, "cannot restore running-xacts from CLOG because oldestActiveXid is not set"); + goto fail; + } + + /* + * We will scan the CLOG starting from the oldest active XID. + * + * In some corner cases, the oldestActiveXid from the last checkpoint + * might already have been truncated from the CLOG. That is, + * oldestActiveXid might be older than oldestXid. That's possible because + * oldestActiveXid is only updated at checkpoints. After the last + * checkpoint, the oldest transaction might have committed, and the CLOG + * might also have been already truncated. So if oldestActiveXid is older + * than oldestXid, start at oldestXid instead. (Otherwise we'd try to + * access CLOG segments that have already been truncated away.) + */ + from = TransactionIdPrecedes(checkpoint->oldestXid, checkpoint->oldestActiveXid) + ? checkpoint->oldestActiveXid : checkpoint->oldestXid; + till = XidFromFullTransactionId(checkpoint->nextXid); + + /* + * To avoid "too many KnownAssignedXids" error later during replay, we + * limit number of collected transactions. This is a tradeoff: if we are + * willing to consume more of the KnownAssignedXids space for the XIDs + * now, that allows us to start up, but we might run out of space later. + * + * The size of the KnownAssignedXids array is TOTAL_MAX_CACHED_SUBXIDS, + * which is (PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS). In + * PostgreSQL, that's always enough because the primary will always write + * an XLOG_XACT_ASSIGNMENT record if a transaction has more than + * PGPROC_MAX_CACHED_SUBXIDS subtransactions. Seeing that record allows + * the standby to mark the XIDs in pg_subtrans and removing them from the + * KnowingAssignedXids array. + * + * Here, we don't know which XIDs belong to subtransactions that have + * already been WAL-logged with an XLOG_XACT_ASSIGNMENT record. 
If we + * wanted to be totally safe and avoid the possibility of getting a "too + * many KnownAssignedXids" error later, we would have to limit ourselves + * to PGPROC_MAX_CACHED_SUBXIDS, which is not much. And that includes top + * transaction IDs too, because we cannot distinguish between top + * transaction IDs and subtransactions here. + * + * Somewhat arbitrarily, we use up to half of KnownAssignedXids. That + * strikes a sensible balance between being useful, and risking a "too + * many KnownAssignedXids" error later. + */ + max_xcnt = TOTAL_MAX_CACHED_SUBXIDS / 2; + + /* + * Collect XIDs of prepared transactions in an array. This includes only + * their top-level XIDs. We assume that StandbyRecoverPreparedTransactions + * has already been called, so we can find all the sub-transactions in + * pg_subtrans. + */ + PrescanPreparedTransactions(&prepared_xids, &n_prepared_xids); + qsort(prepared_xids, n_prepared_xids, sizeof(TransactionId), xidLogicalComparator); + + /* + * Scan the CLOG, collecting in-progress XIDs into 'restored_xids'. + */ + elog(DEBUG1, "scanning CLOG between %u and %u for in-progress XIDs", from, till); + restored_xids = (TransactionId *) palloc(max_xcnt * sizeof(TransactionId)); + n_restored_xids = 0; + next_prepared_idx = 0; + for (TransactionId xid = from; xid != till;) + { + XLogRecPtr xidlsn; + XidStatus xidstatus; + + xidstatus = TransactionIdGetStatus(xid, &xidlsn); + + /* + * "Merge" the prepared transactions into the restored_xids array as + * we go. The prepared transactions array is sorted. This is mostly + * a sanity check to ensure that all the prpeared transactions are + * seen as in-progress. (There is a check after the loop that we didn't + * miss any.) + */ + if (next_prepared_idx < n_prepared_xids && xid == prepared_xids[next_prepared_idx]) + { + /* + * This is a top-level transaction ID of a prepared transaction. + * Include it in the array. + */ + + /* sanity check */ + if (xidstatus != TRANSACTION_STATUS_IN_PROGRESS) + { + elog(LOG, "prepared transaction %u has unexpected status %X, cannot restore running-xacts from CLOG", + xid, xidstatus); + Assert(false); + goto fail; + } + + elog(DEBUG1, "XID %u: was next prepared xact (%d / %d)", xid, next_prepared_idx, n_prepared_xids); + next_prepared_idx++; + } + else if (xidstatus == TRANSACTION_STATUS_COMMITTED) + { + elog(DEBUG1, "XID %u: was committed", xid); + goto skip; + } + else if (xidstatus == TRANSACTION_STATUS_ABORTED) + { + elog(DEBUG1, "XID %u: was aborted", xid); + goto skip; + } + else if (xidstatus == TRANSACTION_STATUS_IN_PROGRESS) + { + /* + * In-progress transactions are included in the array. + * + * Except subtransactions of the prepared transactions. They are + * already set in pg_subtrans, and hence don't need to be tracked + * in the known-assigned XIDs array. + */ + if (n_prepared_xids > 0) + { + TransactionId parent = SubTransGetParent(xid); + + if (TransactionIdIsValid(parent)) + { + /* + * This is a subtransaction belonging to a prepared + * transaction. + * + * Sanity check that it is in the prepared XIDs array. It + * should be, because StandbyRecoverPreparedTransactions + * populated pg_subtrans, and no other XID should be set + * in it yet. (This also relies on the fact that + * StandbyRecoverPreparedTransactions sets the parent of + * each subxid to point directly to the top-level XID, + * rather than restoring the original subtransaction + * hierarchy.) 
+ */ + if (bsearch(&parent, prepared_xids, next_prepared_idx, + sizeof(TransactionId), xidLogicalComparator) == NULL) + { + elog(LOG, "sub-XID %u has unexpected parent %u, cannot restore running-xacts from CLOG", + xid, parent); + Assert(false); + goto fail; + } + elog(DEBUG1, "XID %u: was a subtransaction of prepared xid %u", xid, parent); + goto skip; + } + } + + /* include it in the array */ + elog(DEBUG1, "XID %u: is in progress", xid); + } + else + { + /* + * SUB_COMMITTED is a transient state used at commit. We don't + * expect to see that here. + */ + elog(LOG, "XID %u has unexpected status %X in pg_xact, cannot restore running-xacts from CLOG", + xid, xidstatus); + Assert(false); + goto fail; + } + + if (n_restored_xids >= max_xcnt) + { + /* + * Overflowed. We won't be able to install the RunningTransactions + * snapshot. + */ + elog(LOG, "too many running xacts to restore from the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u", + checkpoint->oldestXid, checkpoint->oldestActiveXid, + XidFromFullTransactionId(checkpoint->nextXid)); + goto fail; + } + + restored_xids[n_restored_xids++] = xid; + + skip: + TransactionIdAdvance(xid); + continue; + } + + /* sanity check */ + if (next_prepared_idx != n_prepared_xids) + { + elog(LOG, "prepared transaction ID %u was not visited in the CLOG scan, cannot restore running-xacts from CLOG", + prepared_xids[next_prepared_idx]); + Assert(false); + goto fail; + } + + elog(LOG, "restored %d running xacts by scanning the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u", + n_restored_xids, checkpoint->oldestXid, checkpoint->oldestActiveXid, XidFromFullTransactionId(checkpoint->nextXid)); + *nxids = n_restored_xids; + *xids = restored_xids; + return true; + + fail: + *nxids = 0; + *xids = NULL; + if (restored_xids) + pfree(restored_xids); + if (prepared_xids) + pfree(prepared_xids); + return false; +} + void _PG_init(void) { @@ -288,6 +579,8 @@ _PG_init(void) pg_init_extension_server(); + restore_running_xacts_callback = RestoreRunningXactsFromClog; + /* * Important: This must happen after other parts of the extension are * loaded, otherwise any settings to GUCs that were set before the diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index da1a6f76f0a2..944b316344dd 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -63,6 +63,8 @@ char *wal_acceptors_list = ""; int wal_acceptor_reconnect_timeout = 1000; int wal_acceptor_connection_timeout = 10000; +/* Set to true in the walproposer bgw. 
*/ +static bool am_walproposer; static WalproposerShmemState *walprop_shared; static WalProposerConfig walprop_config; static XLogRecPtr sentPtr = InvalidXLogRecPtr; @@ -76,6 +78,7 @@ static HotStandbyFeedback agg_hs_feedback; static void nwp_shmem_startup_hook(void); static void nwp_register_gucs(void); +static void assign_neon_safekeepers(const char *newval, void *extra); static void nwp_prepare_shmem(void); static uint64 backpressure_lag_impl(void); static bool backpressure_throttling_impl(void); @@ -111,7 +114,8 @@ init_walprop_config(bool syncSafekeepers) { walprop_config.neon_tenant = neon_tenant; walprop_config.neon_timeline = neon_timeline; - walprop_config.safekeepers_list = wal_acceptors_list; + /* WalProposerCreate scribbles directly on it, so pstrdup */ + walprop_config.safekeepers_list = pstrdup(wal_acceptors_list); walprop_config.safekeeper_reconnect_timeout = wal_acceptor_reconnect_timeout; walprop_config.safekeeper_connection_timeout = wal_acceptor_connection_timeout; walprop_config.wal_segment_size = wal_segment_size; @@ -151,6 +155,7 @@ WalProposerMain(Datum main_arg) init_walprop_config(false); walprop_pg_init_bgworker(); + am_walproposer = true; walprop_pg_load_libpqwalreceiver(); wp = WalProposerCreate(&walprop_config, walprop_pg); @@ -189,10 +194,10 @@ nwp_register_gucs(void) NULL, /* long_desc */ &wal_acceptors_list, /* valueAddr */ "", /* bootValue */ - PGC_POSTMASTER, + PGC_SIGHUP, GUC_LIST_INPUT, /* extensions can't use* * GUC_LIST_QUOTE */ - NULL, NULL, NULL); + NULL, assign_neon_safekeepers, NULL); DefineCustomIntVariable( "neon.safekeeper_reconnect_timeout", @@ -215,6 +220,33 @@ nwp_register_gucs(void) NULL, NULL, NULL); } +/* + * GUC assign_hook for neon.safekeepers. Restarts walproposer through FATAL if + * the list changed. + */ +static void +assign_neon_safekeepers(const char *newval, void *extra) +{ + if (!am_walproposer) + return; + + if (!newval) { + /* should never happen */ + wpg_log(FATAL, "neon.safekeepers is empty"); + } + + /* + * TODO: restarting through FATAL is stupid and introduces 1s delay before + * next bgw start. We should refactor walproposer to allow graceful exit and + * thus remove this delay. + */ + if (strcmp(wal_acceptors_list, newval) != 0) + { + wpg_log(FATAL, "restarting walproposer to change safekeeper list from %s to %s", + wal_acceptors_list, newval); + } +} + /* Check if we need to suspend inserts because of lagging replication. */ static uint64 backpressure_lag_impl(void) @@ -363,7 +395,7 @@ walprop_register_bgworker(void) snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain"); snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer"); snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer"); - bgw.bgw_restart_time = 5; + bgw.bgw_restart_time = 1; bgw.bgw_notify_pid = 0; bgw.bgw_main_arg = (Datum) 0; @@ -1639,6 +1671,18 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 late_cv_trigger = ConditionVariableCancelSleep(); #endif + /* + * Process config if requested. This restarts walproposer if safekeepers + * list changed. Don't do that for sync-safekeepers because quite probably + * it (re-reading config) won't work without some effort, and + * sync-safekeepers should be quick to finish anyway. + */ + if (!wp->config->syncSafekeepers && ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + /* * If wait is terminated by latch set (walsenders' latch is set on each * wal flush). 
(no need for pm death check due to WL_EXIT_ON_PM_DEATH) diff --git a/pgxn/neon_test_utils/Makefile b/pgxn/neon_test_utils/Makefile index 1ee87357e5e2..252810b5b02e 100644 --- a/pgxn/neon_test_utils/Makefile +++ b/pgxn/neon_test_utils/Makefile @@ -7,7 +7,7 @@ OBJS = \ neontest.o EXTENSION = neon_test_utils -DATA = neon_test_utils--1.1.sql +DATA = neon_test_utils--1.3.sql PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging" PG_CONFIG = pg_config diff --git a/pgxn/neon_test_utils/neon_test_utils--1.1.sql b/pgxn/neon_test_utils/neon_test_utils--1.3.sql similarity index 74% rename from pgxn/neon_test_utils/neon_test_utils--1.1.sql rename to pgxn/neon_test_utils/neon_test_utils--1.3.sql index 534784f31912..3b8794a8cff4 100644 --- a/pgxn/neon_test_utils/neon_test_utils--1.1.sql +++ b/pgxn/neon_test_utils/neon_test_utils--1.3.sql @@ -41,7 +41,25 @@ RETURNS bytea AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' LANGUAGE C PARALLEL UNSAFE; -CREATE FUNCTION neon_xlogflush(lsn pg_lsn) +CREATE FUNCTION neon_xlogflush(lsn pg_lsn DEFAULT NULL) RETURNS VOID AS 'MODULE_PATHNAME', 'neon_xlogflush' LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION trigger_panic() +RETURNS VOID +AS 'MODULE_PATHNAME', 'trigger_panic' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION trigger_segfault() +RETURNS VOID +AS 'MODULE_PATHNAME', 'trigger_segfault' +LANGUAGE C PARALLEL UNSAFE; + +-- Alias for `trigger_segfault`, just because `SELECT 💣()` looks fun +CREATE OR REPLACE FUNCTION 💣() RETURNS void +LANGUAGE plpgsql AS $$ +BEGIN + PERFORM trigger_segfault(); +END; +$$; diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control index 5f6d64083591..f22afd70c4fa 100644 --- a/pgxn/neon_test_utils/neon_test_utils.control +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -1,6 +1,6 @@ # neon_test_utils extension comment = 'helpers for neon testing and debugging' -default_version = '1.1' +default_version = '1.3' module_pathname = '$libdir/neon_test_utils' relocatable = true trusted = true diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 47f245fbf1af..650ef7405d64 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -15,6 +15,7 @@ #include "access/relation.h" #include "access/xact.h" #include "access/xlog.h" +#include "access/xlog_internal.h" #include "catalog/namespace.h" #include "fmgr.h" #include "funcapi.h" @@ -41,6 +42,8 @@ PG_FUNCTION_INFO_V1(clear_buffer_cache); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); PG_FUNCTION_INFO_V1(neon_xlogflush); +PG_FUNCTION_INFO_V1(trigger_panic); +PG_FUNCTION_INFO_V1(trigger_segfault); /* * Linkage to functions in neon module. @@ -444,12 +447,68 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) /* * Directly calls XLogFlush(lsn) to flush WAL buffers. + * + * If 'lsn' is not specified (is NULL), flush all generated WAL. */ Datum neon_xlogflush(PG_FUNCTION_ARGS) { - XLogRecPtr lsn = PG_GETARG_LSN(0); + XLogRecPtr lsn; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("cannot flush WAL during recovery."))); + + if (!PG_ARGISNULL(0)) + lsn = PG_GETARG_LSN(0); + else + { + lsn = GetXLogInsertRecPtr(); + + /*--- + * The LSN returned by GetXLogInsertRecPtr() is the position where the + * next inserted record would begin. 
If the last record ended just at + * the page boundary, the next record will begin after the page header + * on the next page, but the next page's page header has not been + * written yet. If we tried to flush it, XLogFlush() would throw an + * error: + * + * ERROR : xlog flush request %X/%X is not satisfied --- flushed only to %X/%X + * + * To avoid that, if the insert position points to just after the page + * header, back off to page boundary. + */ + if (lsn % XLOG_BLCKSZ == SizeOfXLogShortPHD && + XLogSegmentOffset(lsn, wal_segment_size) > XLOG_BLCKSZ) + lsn -= SizeOfXLogShortPHD; + else if (lsn % XLOG_BLCKSZ == SizeOfXLogLongPHD && + XLogSegmentOffset(lsn, wal_segment_size) < XLOG_BLCKSZ) + lsn -= SizeOfXLogLongPHD; + } XLogFlush(lsn); PG_RETURN_VOID(); } + +/* + * Function to trigger panic. + */ +Datum +trigger_panic(PG_FUNCTION_ARGS) +{ + elog(PANIC, "neon_test_utils: panic"); + PG_RETURN_VOID(); +} + +/* + * Function to trigger a segfault. + */ +Datum +trigger_segfault(PG_FUNCTION_ARGS) +{ + int *ptr = NULL; + *ptr = 42; + PG_RETURN_VOID(); +} diff --git a/poetry.lock b/poetry.lock index 7740388fb8be..809114141188 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aiohttp" @@ -734,13 +734,13 @@ typing-extensions = ">=4.1.0" [[package]] name = "certifi" -version = "2023.7.22" +version = "2024.7.4" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"}, - {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"}, + {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, + {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, ] [[package]] @@ -3133,18 +3133,18 @@ multidict = ">=4.0" [[package]] name = "zipp" -version = "3.8.1" +version = "3.19.1" description = "Backport of pathlib-compatible object wrapper for zip files" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "zipp-3.8.1-py3-none-any.whl", hash = "sha256:47c40d7fe183a6f21403a199b3e4192cca5774656965b0a4988ad2f8feb5f009"}, - {file = "zipp-3.8.1.tar.gz", hash = "sha256:05b45f1ee8f807d0cc928485ca40a07cb491cf092ff587c0df9cb1fd154848d2"}, + {file = "zipp-3.19.1-py3-none-any.whl", hash = "sha256:2828e64edb5386ea6a52e7ba7cdb17bb30a73a858f5eb6eb93d8d36f5ea26091"}, + {file = "zipp-3.19.1.tar.gz", hash = "sha256:35427f6d5594f4acf82d25541438348c26736fa9b3afa2754bcd63cdb99d8e8f"}, ] [package.extras] -docs = ["jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx"] -testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", 
"pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] [[package]] name = "zstandard" diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 44e880838e07..d7a3eb9a4d18 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -216,10 +216,11 @@ async fn ssl_handshake( use pq_proto::FeStartupPacket::*; match msg { - SslRequest => { + SslRequest { direct: false } => { stream .write_message(&pq_proto::BeMessage::EncryptionResponse(true)) .await?; + // Upgrade raw stream into a secure TLS-backed stream. // NOTE: We've consumed `tls`; this fact will be used later. diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index dffebf55800c..7f4cb2c0100c 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -35,6 +35,7 @@ use proxy::usage_metrics; use anyhow::bail; use proxy::config::{self, ProxyConfig}; use proxy::serverless; +use remote_storage::RemoteStorageConfig; use std::net::SocketAddr; use std::pin::pin; use std::sync::Arc; @@ -205,8 +206,8 @@ struct ProxyCliArgs { /// remote storage configuration for backup metric collection /// Encoded as toml (same format as pageservers), eg /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` - #[clap(long, default_value = "{}")] - metric_backup_collection_remote_storage: String, + #[clap(long, value_parser = remote_storage_from_toml)] + metric_backup_collection_remote_storage: Option, /// chunk size for backup metric collection /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. #[clap(long, default_value = "4194304")] @@ -511,9 +512,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } let backup_metric_collection_config = config::MetricBackupCollectionConfig { interval: args.metric_backup_collection_interval, - remote_storage_config: remote_storage_from_toml( - &args.metric_backup_collection_remote_storage, - )?, + remote_storage_config: args.metric_backup_collection_remote_storage.clone(), chunk_size: args.metric_backup_collection_chunk_size, }; diff --git a/proxy/src/cache/common.rs b/proxy/src/cache/common.rs index bc1c37512bce..4e393fddb2aa 100644 --- a/proxy/src/cache/common.rs +++ b/proxy/src/cache/common.rs @@ -53,6 +53,13 @@ impl Cached { ) } + pub fn map(self, f: impl FnOnce(V) -> U) -> Cached { + Cached { + token: self.token, + value: f(self.value), + } + } + /// Drop this entry from a cache if it's still there. pub fn invalidate(self) -> V { if let Some((cache, info)) = &self.token { diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 3b21381bb971..c5c4f6a1ed09 100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -65,6 +65,8 @@ impl Cache for TimedLru { struct Entry { created_at: Instant, expires_at: Instant, + ttl: Duration, + update_ttl_on_retrieval: bool, value: T, } @@ -122,7 +124,6 @@ impl TimedLru { Q: Hash + Eq + ?Sized, { let now = Instant::now(); - let deadline = now.checked_add(self.ttl).expect("time overflow"); // Do costly things before taking the lock. let mut cache = self.cache.lock(); @@ -142,7 +143,8 @@ impl TimedLru { let (created_at, expires_at) = (entry.created_at, entry.expires_at); // Update the deadline and the entry's position in the LRU list. 
- if self.update_ttl_on_retrieval { + let deadline = now.checked_add(raw_entry.get().ttl).expect("time overflow"); + if raw_entry.get().update_ttl_on_retrieval { raw_entry.get_mut().expires_at = deadline; } raw_entry.to_back(); @@ -162,12 +164,27 @@ impl TimedLru { /// existed, return the previous value and its creation timestamp. #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] fn insert_raw(&self, key: K, value: V) -> (Instant, Option) { + self.insert_raw_ttl(key, value, self.ttl, self.update_ttl_on_retrieval) + } + + /// Insert an entry to the cache. If an entry with the same key already + /// existed, return the previous value and its creation timestamp. + #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] + fn insert_raw_ttl( + &self, + key: K, + value: V, + ttl: Duration, + update: bool, + ) -> (Instant, Option) { let created_at = Instant::now(); - let expires_at = created_at.checked_add(self.ttl).expect("time overflow"); + let expires_at = created_at.checked_add(ttl).expect("time overflow"); let entry = Entry { created_at, expires_at, + ttl, + update_ttl_on_retrieval: update, value, }; @@ -190,6 +207,21 @@ impl TimedLru { } impl TimedLru { + pub fn insert_ttl(&self, key: K, value: V, ttl: Duration) { + self.insert_raw_ttl(key, value, ttl, false); + } + + pub fn insert_unit(&self, key: K, value: V) -> (Option, Cached<&Self, ()>) { + let (created_at, old) = self.insert_raw(key.clone(), value); + + let cached = Cached { + token: Some((self, LookupInfo { created_at, key })), + value: (), + }; + + (old, cached) + } + pub fn insert(&self, key: K, value: V) -> (Option, Cached<&Self>) { let (created_at, old) = self.insert_raw(key.clone(), value.clone()); diff --git a/proxy/src/config.rs b/proxy/src/config.rs index f4707a33aa79..650491976053 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -75,6 +75,9 @@ impl TlsConfig { } } +/// +pub const PG_ALPN_PROTOCOL: &[u8] = b"postgresql"; + /// Configure TLS for the main endpoint. pub fn configure_tls( key_path: &str, @@ -111,16 +114,17 @@ pub fn configure_tls( let cert_resolver = Arc::new(cert_resolver); // allow TLS 1.2 to be compatible with older client libraries - let config = rustls::ServerConfig::builder_with_protocol_versions(&[ + let mut config = rustls::ServerConfig::builder_with_protocol_versions(&[ &rustls::version::TLS13, &rustls::version::TLS12, ]) .with_no_client_auth() - .with_cert_resolver(cert_resolver.clone()) - .into(); + .with_cert_resolver(cert_resolver.clone()); + + config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()]; Ok(TlsConfig { - config, + config: Arc::new(config), common_names, cert_resolver, }) @@ -399,15 +403,11 @@ impl FromStr for EndpointCacheConfig { #[derive(Debug)] pub struct MetricBackupCollectionConfig { pub interval: Duration, - pub remote_storage_config: OptRemoteStorageConfig, + pub remote_storage_config: Option, pub chunk_size: usize, } -/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get -/// runtime type errors from the value parser we use. -pub type OptRemoteStorageConfig = Option; - -pub fn remote_storage_from_toml(s: &str) -> anyhow::Result { +pub fn remote_storage_from_toml(s: &str) -> anyhow::Result { RemoteStorageConfig::from_toml(&s.parse()?) 
} diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index d28d13ba692b..9abf24ab7ffa 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -9,7 +9,7 @@ use crate::proxy::retry::CouldRetry; /// Generic error response with human-readable description. /// Note that we can't always present it to user as is. -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct ConsoleError { pub error: Box, #[serde(skip)] @@ -82,41 +82,19 @@ impl CouldRetry for ConsoleError { .details .error_info .map_or(Reason::Unknown, |e| e.reason); - match reason { - // not a transitive error - Reason::RoleProtected => false, - // on retry, it will still not be found - Reason::ResourceNotFound - | Reason::ProjectNotFound - | Reason::EndpointNotFound - | Reason::BranchNotFound => false, - // we were asked to go away - Reason::RateLimitExceeded - | Reason::NonDefaultBranchComputeTimeExceeded - | Reason::ActiveTimeQuotaExceeded - | Reason::ComputeTimeQuotaExceeded - | Reason::WrittenDataQuotaExceeded - | Reason::DataTransferQuotaExceeded - | Reason::LogicalSizeQuotaExceeded => false, - // transitive error. control plane is currently busy - // but might be ready soon - Reason::RunningOperations => true, - Reason::ConcurrencyLimitReached => true, - Reason::LockAlreadyTaken => true, - // unknown error. better not retry it. - Reason::Unknown => false, - } + + reason.can_retry() } } -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct Status { pub code: Box, pub message: Box, pub details: Details, } -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct Details { pub error_info: Option, pub retry_info: Option, @@ -199,6 +177,34 @@ impl Reason { | Reason::BranchNotFound ) } + + pub fn can_retry(&self) -> bool { + match self { + // do not retry role protected errors + // not a transitive error + Reason::RoleProtected => false, + // on retry, it will still not be found + Reason::ResourceNotFound + | Reason::ProjectNotFound + | Reason::EndpointNotFound + | Reason::BranchNotFound => false, + // we were asked to go away + Reason::RateLimitExceeded + | Reason::NonDefaultBranchComputeTimeExceeded + | Reason::ActiveTimeQuotaExceeded + | Reason::ComputeTimeQuotaExceeded + | Reason::WrittenDataQuotaExceeded + | Reason::DataTransferQuotaExceeded + | Reason::LogicalSizeQuotaExceeded => false, + // transitive error. control plane is currently busy + // but might be ready soon + Reason::RunningOperations + | Reason::ConcurrencyLimitReached + | Reason::LockAlreadyTaken => true, + // unknown error. better not retry it. 
+ Reason::Unknown => false, + } + } } #[derive(Copy, Clone, Debug, Deserialize)] @@ -206,7 +212,7 @@ pub struct RetryInfo { pub retry_delay_ms: u64, } -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct UserFacingMessage { pub message: Box, } diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index c7a2d467c016..befe7d75104b 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -6,8 +6,9 @@ use anyhow::Context; use once_cell::sync::Lazy; use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; -use std::{convert::Infallible, future}; +use std::convert::Infallible; use tokio::net::{TcpListener, TcpStream}; +use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, Instrument}; static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); @@ -67,7 +68,9 @@ pub async fn task_main(listener: TcpListener) -> anyhow::Result { async fn handle_connection(socket: TcpStream) -> Result<(), QueryError> { let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?; - pgbackend.run(&mut MgmtHandler, future::pending::<()>).await + pgbackend + .run(&mut MgmtHandler, &CancellationToken::new()) + .await } /// A message received by `mgmt` when a compute node is ready. diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index bec55a83435f..7a9637066fb1 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -2,7 +2,7 @@ pub mod mock; pub mod neon; -use super::messages::MetricsAuxInfo; +use super::messages::{ConsoleError, MetricsAuxInfo}; use crate::{ auth::{ backend::{ComputeCredentialKeys, ComputeUserInfo}, @@ -317,8 +317,8 @@ impl NodeInfo { } } -pub type NodeInfoCache = TimedLru; -pub type CachedNodeInfo = Cached<&'static NodeInfoCache>; +pub type NodeInfoCache = TimedLru>>; +pub type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>; pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 41bd2f49567e..a6e67be22f13 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -9,7 +9,7 @@ use super::{ use crate::{ auth::backend::ComputeUserInfo, compute, - console::messages::ColdStartInfo, + console::messages::{ColdStartInfo, Reason}, http, metrics::{CacheOutcome, Metrics}, rate_limiter::EndpointRateLimiter, @@ -17,10 +17,10 @@ use crate::{ }; use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; -use std::sync::Arc; +use std::{sync::Arc, time::Duration}; use tokio::time::Instant; use tokio_postgres::config::SslMode; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{debug, error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, @@ -273,26 +273,34 @@ impl super::Api for Api { ) -> Result { let key = user_info.endpoint_cache_key(); + macro_rules! 
check_cache { + () => { + if let Some(cached) = self.caches.node_info.get(&key) { + let (cached, info) = cached.take_value(); + let info = info.map_err(|c| { + info!(key = &*key, "found cached wake_compute error"); + WakeComputeError::ApiError(ApiError::Console(*c)) + })?; + + debug!(key = &*key, "found cached compute node info"); + ctx.set_project(info.aux.clone()); + return Ok(cached.map(|()| info)); + } + }; + } + // Every time we do a wakeup http request, the compute node will stay up // for some time (highly depends on the console's scale-to-zero policy); // The connection info remains the same during that period of time, // which means that we might cache it to reduce the load and latency. - if let Some(cached) = self.caches.node_info.get(&key) { - info!(key = &*key, "found cached compute node info"); - ctx.set_project(cached.aux.clone()); - return Ok(cached); - } + check_cache!(); let permit = self.locks.get_permit(&key).await?; // after getting back a permit - it's possible the cache was filled // double check if permit.should_check_cache() { - if let Some(cached) = self.caches.node_info.get(&key) { - info!(key = &*key, "found cached compute node info"); - ctx.set_project(cached.aux.clone()); - return Ok(cached); - } + check_cache!(); } // check rate limit @@ -300,23 +308,56 @@ impl super::Api for Api { .wake_compute_endpoint_rate_limiter .check(user_info.endpoint.normalize_intern(), 1) { - info!(key = &*key, "found cached compute node info"); return Err(WakeComputeError::TooManyConnections); } - let mut node = permit.release_result(self.do_wake_compute(ctx, user_info).await)?; - ctx.set_project(node.aux.clone()); - let cold_start_info = node.aux.cold_start_info; - info!("woken up a compute node"); + let node = permit.release_result(self.do_wake_compute(ctx, user_info).await); + match node { + Ok(node) => { + ctx.set_project(node.aux.clone()); + debug!(key = &*key, "created a cache entry for woken compute node"); - // store the cached node as 'warm' - node.aux.cold_start_info = ColdStartInfo::WarmCached; - let (_, mut cached) = self.caches.node_info.insert(key.clone(), node); - cached.aux.cold_start_info = cold_start_info; + let mut stored_node = node.clone(); + // store the cached node as 'warm_cached' + stored_node.aux.cold_start_info = ColdStartInfo::WarmCached; - info!(key = &*key, "created a cache entry for compute node info"); + let (_, cached) = self.caches.node_info.insert_unit(key, Ok(stored_node)); - Ok(cached) + Ok(cached.map(|()| node)) + } + Err(err) => match err { + WakeComputeError::ApiError(ApiError::Console(err)) => { + let Some(status) = &err.status else { + return Err(WakeComputeError::ApiError(ApiError::Console(err))); + }; + + let reason = status + .details + .error_info + .map_or(Reason::Unknown, |x| x.reason); + + // if we can retry this error, do not cache it. + if reason.can_retry() { + return Err(WakeComputeError::ApiError(ApiError::Console(err))); + } + + // at this point, we should only have quota errors. 
+ debug!( + key = &*key, + "created a cache entry for the wake compute error" + ); + + self.caches.node_info.insert_ttl( + key, + Err(Box::new(err.clone())), + Duration::from_secs(30), + ); + + Err(WakeComputeError::ApiError(ApiError::Console(err))) + } + err => return Err(err), + }, + } } } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index e72bf199e362..cfc1f8e89e3f 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -14,17 +14,14 @@ use parquet::{ record::RecordWriter, }; use pq_proto::StartupMessageParams; -use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; +use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel}; use serde::ser::SerializeMap; use tokio::{sync::mpsc, time}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; -use crate::{ - config::{remote_storage_from_toml, OptRemoteStorageConfig}, - context::LOG_CHAN_DISCONNECT, -}; +use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT}; use super::{RequestMonitoring, LOG_CHAN}; @@ -33,11 +30,11 @@ pub struct ParquetUploadArgs { /// Storage location to upload the parquet files to. /// Encoded as toml (same format as pageservers), eg /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` - #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] - parquet_upload_remote_storage: OptRemoteStorageConfig, + #[clap(long, value_parser = remote_storage_from_toml)] + parquet_upload_remote_storage: Option, - #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] - parquet_upload_disconnect_events_remote_storage: OptRemoteStorageConfig, + #[clap(long, value_parser = remote_storage_from_toml)] + parquet_upload_disconnect_events_remote_storage: Option, /// How many rows to include in a row group #[clap(long, default_value_t = 8192)] diff --git a/proxy/src/http.rs b/proxy/src/http.rs index fc7400869ffa..dd7164181d94 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -4,14 +4,11 @@ pub mod health_server; -use std::{str::FromStr, sync::Arc, time::Duration}; +use std::time::Duration; -use futures::FutureExt; pub use reqwest::{Request, Response, StatusCode}; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; -use tokio::time::Instant; -use tracing::trace; use crate::{ metrics::{ConsoleRequest, Metrics}, @@ -24,8 +21,6 @@ use reqwest_middleware::RequestBuilder; /// We deliberately don't want to replace this with a public static. 
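
// Illustrative sketch (not part of the patch) of the clap pattern adopted above: with a
// fallible `value_parser`, the field can be a plain `Option<T>` that stays `None` when the
// flag is omitted, which removes the need for the old `default_value = "{}"` / type-alias
// workaround. `StorageConfig` and `parse_storage` are hypothetical stand-ins for
// `RemoteStorageConfig` and `remote_storage_from_toml`.
use clap::Parser;

#[derive(Clone, Debug)]
struct StorageConfig { bucket: String }

fn parse_storage(s: &str) -> anyhow::Result<StorageConfig> {
    // A real parser would decode the inline TOML table here.
    Ok(StorageConfig { bucket: s.to_string() })
}

#[derive(Parser)]
struct SketchArgs {
    /// Optional remote storage configuration; omitting the flag simply means "no remote storage".
    #[clap(long, value_parser = parse_storage)]
    remote_storage: Option<StorageConfig>,
}
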
pub fn new_client() -> ClientWithMiddleware { let client = reqwest::ClientBuilder::new() - .dns_resolver(Arc::new(GaiResolver::default())) - .connection_verbose(true) .build() .expect("Failed to create http client"); @@ -36,8 +31,6 @@ pub fn new_client() -> ClientWithMiddleware { pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware { let timeout_client = reqwest::ClientBuilder::new() - .dns_resolver(Arc::new(GaiResolver::default())) - .connection_verbose(true) .timeout(default_timout) .build() .expect("Failed to create http client with timeout"); @@ -103,38 +96,6 @@ impl Endpoint { } } -use hyper_util::client::legacy::connect::dns::{ - GaiResolver as HyperGaiResolver, Name as HyperName, -}; -use reqwest::dns::{Addrs, Name, Resolve, Resolving}; -/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html -use tower_service::Service; -#[derive(Debug)] -pub struct GaiResolver(HyperGaiResolver); - -impl Default for GaiResolver { - fn default() -> Self { - Self(HyperGaiResolver::new()) - } -} - -impl Resolve for GaiResolver { - fn resolve(&self, name: Name) -> Resolving { - let this = &mut self.0.clone(); - let hyper_name = HyperName::from_str(name.as_str()).expect("name should be valid"); - let start = Instant::now(); - Box::pin( - Service::::call(this, hyper_name).map(move |result| { - let resolve_duration = start.elapsed(); - trace!(duration = ?resolve_duration, addr = %name.as_str(), "resolve host complete"); - result - .map(|addrs| -> Addrs { Box::new(addrs) }) - .map_err(|err| -> Box { Box::new(err) }) - }), - ) - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs index 3243e6a14010..d307d80f4af9 100644 --- a/proxy/src/jemalloc.rs +++ b/proxy/src/jemalloc.rs @@ -3,8 +3,8 @@ use std::marker::PhantomData; use measured::{ label::NoLabels, metric::{ - gauge::GaugeState, group::Encoding, group::MetricValue, name::MetricNameEncoder, - MetricEncoding, MetricFamilyEncoding, MetricType, + gauge::GaugeState, group::Encoding, name::MetricNameEncoder, MetricEncoding, + MetricFamilyEncoding, MetricType, }, text::TextEncoder, LabelGroup, MetricGroup, @@ -100,7 +100,7 @@ macro_rules! 
jemalloc_gauge { enc: &mut TextEncoder, ) -> Result<(), std::io::Error> { if let Ok(v) = mib.read() { - enc.write_metric_value(name, labels, MetricValue::Int(v as i64))?; + GaugeState::new(v as i64).collect_into(&(), labels, name, enc)?; } Ok(()) } diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 3405b8cbc672..3b30ad8b4663 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -15,7 +15,8 @@ use tracing_subscriber::{ pub async fn init() -> anyhow::Result { let env_filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) - .from_env_lossy(); + .from_env_lossy() + .add_directive("azure_core::policies::transport=off".parse().unwrap()); let fmt_layer = tracing_subscriber::fmt::layer() .with_ansi(false) diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index e2a75a872009..db25ac031115 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -2,7 +2,7 @@ use std::sync::{Arc, OnceLock}; use lasso::ThreadedRodeo; use measured::{ - label::{FixedCardinalitySet, LabelName, LabelSet, LabelValue, StaticLabelSet}, + label::{FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet}, metric::{histogram::Thresholds, name::MetricName}, Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec, LabelGroup, MetricGroup, @@ -577,6 +577,32 @@ impl LabelGroup for ThreadPoolWorkerId { } } +impl LabelGroupSet for ThreadPoolWorkers { + type Group<'a> = ThreadPoolWorkerId; + + fn cardinality(&self) -> Option { + Some(self.0) + } + + fn encode_dense(&self, value: Self::Unique) -> Option { + Some(value) + } + + fn decode_dense(&self, value: usize) -> Self::Group<'_> { + ThreadPoolWorkerId(value) + } + + type Unique = usize; + + fn encode(&self, value: Self::Group<'_>) -> Option { + Some(value.0) + } + + fn decode(&self, value: &Self::Unique) -> Self::Group<'_> { + ThreadPoolWorkerId(*value) + } +} + impl LabelSet for ThreadPoolWorkers { type Value<'a> = ThreadPoolWorkerId; diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index dd935cc24528..d488aea9275b 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -1,11 +1,17 @@ -use pq_proto::{BeMessage as Be, CancelKeyData, FeStartupPacket, StartupMessageParams}; +use bytes::Buf; +use pq_proto::{ + framed::Framed, BeMessage as Be, CancelKeyData, FeStartupPacket, ProtocolVersion, + StartupMessageParams, +}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::info; +use tracing::{info, warn}; use crate::{ - config::TlsConfig, + auth::endpoint_sni, + config::{TlsConfig, PG_ALPN_PROTOCOL}, error::ReportableError, + metrics::Metrics, proxy::ERR_INSECURE_CONNECTION, stream::{PqStream, Stream, StreamUpgradeError}, }; @@ -68,6 +74,9 @@ pub async fn handshake( // Client may try upgrading to each protocol only once let (mut tried_ssl, mut tried_gss) = (false, false); + const PG_PROTOCOL_EARLIEST: ProtocolVersion = ProtocolVersion::new(3, 0); + const PG_PROTOCOL_LATEST: ProtocolVersion = ProtocolVersion::new(3, 0); + let mut stream = PqStream::new(Stream::from_raw(stream)); loop { let msg = stream.read_startup_packet().await?; @@ -75,40 +84,96 @@ pub async fn handshake( use FeStartupPacket::*; match msg { - SslRequest => match stream.get_ref() { + SslRequest { direct } => match stream.get_ref() { Stream::Raw { .. 
} if !tried_ssl => { tried_ssl = true; // We can't perform TLS handshake without a config - let enc = tls.is_some(); - stream.write_message(&Be::EncryptionResponse(enc)).await?; + let have_tls = tls.is_some(); + if !direct { + stream + .write_message(&Be::EncryptionResponse(have_tls)) + .await?; + } else if !have_tls { + return Err(HandshakeError::ProtocolViolation); + } + if let Some(tls) = tls.take() { // Upgrade raw stream into a secure TLS-backed stream. // NOTE: We've consumed `tls`; this fact will be used later. - let (raw, read_buf) = stream.into_inner(); - // TODO: Normally, client doesn't send any data before - // server says TLS handshake is ok and read_buf is empy. - // However, you could imagine pipelining of postgres - // SSLRequest + TLS ClientHello in one hunk similar to - // pipelining in our node js driver. We should probably - // support that by chaining read_buf with the stream. + let Framed { + stream: raw, + read_buf, + write_buf, + } = stream.framed; + + let Stream::Raw { raw } = raw else { + return Err(HandshakeError::StreamUpgradeError( + StreamUpgradeError::AlreadyTls, + )); + }; + + let mut read_buf = read_buf.reader(); + let mut res = Ok(()); + let accept = tokio_rustls::TlsAcceptor::from(tls.to_server_config()) + .accept_with(raw, |session| { + // push the early data to the tls session + while !read_buf.get_ref().is_empty() { + match session.read_tls(&mut read_buf) { + Ok(_) => {} + Err(e) => { + res = Err(e); + break; + } + } + } + }); + + res?; + + let read_buf = read_buf.into_inner(); if !read_buf.is_empty() { return Err(HandshakeError::EarlyData); } - let tls_stream = raw - .upgrade(tls.to_server_config(), record_handshake_error) - .await?; + + let tls_stream = accept.await.inspect_err(|_| { + if record_handshake_error { + Metrics::get().proxy.tls_handshake_failures.inc() + } + })?; + + let conn_info = tls_stream.get_ref().1; + + // check the ALPN, if exists, as required. + match conn_info.alpn_protocol() { + None | Some(PG_ALPN_PROTOCOL) => {} + Some(other) => { + // try parse ep for better error + let ep = conn_info.server_name().and_then(|sni| { + endpoint_sni(sni, &tls.common_names).ok().flatten() + }); + let alpn = String::from_utf8_lossy(other); + warn!(?ep, %alpn, "unexpected ALPN"); + return Err(HandshakeError::ProtocolViolation); + } + } let (_, tls_server_end_point) = tls .cert_resolver - .resolve(tls_stream.get_ref().1.server_name()) + .resolve(conn_info.server_name()) .ok_or(HandshakeError::MissingCertificate)?; - stream = PqStream::new(Stream::Tls { - tls: Box::new(tls_stream), - tls_server_end_point, - }); + stream = PqStream { + framed: Framed { + stream: Stream::Tls { + tls: Box::new(tls_stream), + tls_server_end_point, + }, + read_buf, + write_buf, + }, + }; } } _ => return Err(HandshakeError::ProtocolViolation), @@ -122,7 +187,9 @@ pub async fn handshake( } _ => return Err(HandshakeError::ProtocolViolation), }, - StartupMessage { params, .. } => { + StartupMessage { params, version } + if PG_PROTOCOL_EARLIEST <= version && version <= PG_PROTOCOL_LATEST => + { // Check that the config has been consumed during upgrade // OR we didn't provide it at all (for dev purposes). 
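
// Illustrative sketch (not part of the patch) of the ALPN rule enforced above: the handshake
// accepts a TLS connection only when the client negotiated no ALPN at all (older drivers) or
// negotiated "postgresql"; anything else is treated as a protocol violation. Standalone
// helper with a hypothetical name; `PG_ALPN` mirrors `PG_ALPN_PROTOCOL` from config.rs.
const PG_ALPN: &[u8] = b"postgresql";

fn alpn_is_acceptable(negotiated: Option<&[u8]>) -> bool {
    match negotiated {
        None => true,                    // client did not use ALPN at all
        Some(proto) => proto == PG_ALPN, // must be the "postgresql" protocol
    }
}
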
if tls.is_some() { @@ -131,9 +198,48 @@ pub async fn handshake( .await?; } - info!(session_type = "normal", "successful handshake"); + info!(?version, session_type = "normal", "successful handshake"); break Ok(HandshakeData::Startup(stream, params)); } + // downgrade protocol version + StartupMessage { params, version } + if version.major() == 3 && version > PG_PROTOCOL_LATEST => + { + warn!(?version, "unsupported minor version"); + + // no protocol extensions are supported. + // + let mut unsupported = vec![]; + for (k, _) in params.iter() { + if k.starts_with("_pq_.") { + unsupported.push(k); + } + } + + // TODO: remove unsupported options so we don't send them to compute. + + stream + .write_message(&Be::NegotiateProtocolVersion { + version: PG_PROTOCOL_LATEST, + options: &unsupported, + }) + .await?; + + info!( + ?version, + session_type = "normal", + "successful handshake; unsupported minor version requested" + ); + break Ok(HandshakeData::Startup(stream, params)); + } + StartupMessage { version, .. } => { + warn!( + ?version, + session_type = "normal", + "unsuccessful handshake; unsupported version" + ); + return Err(HandshakeError::ProtocolViolation); + } CancelRequest(cancel_key_data) => { info!(session_type = "cancellation", "successful handshake"); break Ok(HandshakeData::Cancel(cancel_key_data)); diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 8119f39fae6b..5186a9e1b0f1 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -540,8 +540,8 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn }, allow_self_signed_compute: false, }; - let (_, node) = cache.insert("key".into(), node); - node + let (_, node2) = cache.insert_unit("key".into(), Ok(node.clone())); + node2.map(|()| node) } fn helper_create_connect_info( diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 583ff75f7ca7..8118ae5ea89d 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -838,8 +838,9 @@ async fn query_to_json( "finished reading rows" ); - let mut fields = vec![]; - let mut columns = vec![]; + let columns_len = row_stream.columns().len(); + let mut fields = Vec::with_capacity(columns_len); + let mut columns = Vec::with_capacity(columns_len); for c in row_stream.columns() { fields.push(json!({ diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 20650490b1ae..9eb6546d6bae 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -12,7 +12,6 @@ use sd_notify::NotifyState; use tokio::runtime::Handle; use tokio::signal::unix::{signal, SignalKind}; use tokio::task::JoinError; -use toml_edit::Document; use utils::logging::SecretString; use std::env::{var, VarError}; @@ -28,8 +27,9 @@ use utils::pid_file; use metrics::set_build_info_metric; use safekeeper::defaults::{ - DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, - DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, + DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_HEARTBEAT_TIMEOUT, + DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY, + DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, }; use safekeeper::http; use safekeeper::wal_service; @@ -125,7 +125,7 @@ struct Args { peer_recovery: bool, /// Remote storage configuration for WAL backup (offloading to s3) as TOML /// inline table, e.g. 
- /// {"max_concurrent_syncs" = 17, "max_sync_errors": 13, "bucket_name": "", "bucket_region":"", "concurrency_limit": 119} + /// {max_concurrent_syncs = 17, max_sync_errors = 13, bucket_name = "", bucket_region = "", concurrency_limit = 119} /// Safekeeper offloads WAL to /// [prefix_in_bucket/]//, mirroring /// structure on the file system. @@ -191,6 +191,15 @@ struct Args { /// Pending updates to control file will be automatically saved after this interval. #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_CONTROL_FILE_SAVE_INTERVAL)] control_file_save_interval: Duration, + /// Number of allowed concurrent uploads of partial segments to remote storage. + #[arg(long, default_value = DEFAULT_PARTIAL_BACKUP_CONCURRENCY)] + partial_backup_concurrency: usize, + /// How long a timeline must be resident before it is eligible for eviction. + /// Usually, timeline eviction has to wait for `partial_backup_timeout` before being eligible for eviction, + /// but if a timeline is un-evicted and then _not_ written to, it would immediately flap to evicting again, + /// if it weren't for `eviction_min_resident` preventing that. + #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)] + eviction_min_resident: Duration, } // Like PathBufValueParser, but allows empty string. @@ -344,6 +353,8 @@ async fn main() -> anyhow::Result<()> { enable_offload: args.enable_offload, delete_offloaded_wal: args.delete_offloaded_wal, control_file_save_interval: args.control_file_save_interval, + partial_backup_concurrency: args.partial_backup_concurrency, + eviction_min_resident: args.eviction_min_resident, }; // initialize sentry if SENTRY_DSN is provided @@ -441,6 +452,19 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { .map(|res| ("WAL service main".to_owned(), res)); tasks_handles.push(Box::pin(wal_service_handle)); + let timeline_housekeeping_handle = current_thread_rt + .as_ref() + .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle()) + .spawn(async move { + const TOMBSTONE_TTL: Duration = Duration::from_secs(3600 * 24); + loop { + tokio::time::sleep(TOMBSTONE_TTL).await; + GlobalTimelines::housekeeping(&TOMBSTONE_TTL); + } + }) + .map(|res| ("Timeline map housekeeping".to_owned(), res)); + tasks_handles.push(Box::pin(timeline_housekeeping_handle)); + if let Some(pg_listener_tenant_only) = pg_listener_tenant_only { let conf_ = conf.clone(); let wal_service_handle = current_thread_rt @@ -548,16 +572,8 @@ fn set_id(workdir: &Utf8Path, given_id: Option) -> Result { Ok(my_id) } -// Parse RemoteStorage from TOML table. fn parse_remote_storage(storage_conf: &str) -> anyhow::Result { - // funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse - let storage_conf_toml = format!("remote_storage = {storage_conf}"); - let parsed_toml = storage_conf_toml.parse::()?; // parse - let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again - RemoteStorageConfig::from_toml(storage_conf_parsed_toml).and_then(|parsed_config| { - // XXX: Don't print the original toml here, there might be some sensitive data - parsed_config.context("Incorrectly parsed remote storage toml as no remote storage config") - }) + RemoteStorageConfig::from_toml(&storage_conf.parse()?) 
} #[test] diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 067e425570e7..af83feb77fac 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -52,6 +52,12 @@ pub mod defaults { pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20); pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m"; pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s"; + pub const DEFAULT_PARTIAL_BACKUP_CONCURRENCY: &str = "5"; + + // By default, our required residency before eviction is the same as the period that passes + // before uploading a partial segment, so that in normal operation the eviction can happen + // as soon as we have done the partial segment upload. + pub const DEFAULT_EVICTION_MIN_RESIDENT: &str = DEFAULT_PARTIAL_BACKUP_TIMEOUT; } #[derive(Debug, Clone)] @@ -91,6 +97,8 @@ pub struct SafeKeeperConf { pub enable_offload: bool, pub delete_offloaded_wal: bool, pub control_file_save_interval: Duration, + pub partial_backup_concurrency: usize, + pub eviction_min_resident: Duration, } impl SafeKeeperConf { @@ -133,6 +141,8 @@ impl SafeKeeperConf { enable_offload: false, delete_offloaded_wal: false, control_file_save_interval: Duration::from_secs(1), + partial_backup_concurrency: 1, + eviction_min_resident: Duration::ZERO, } } } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 1e965393e397..539ecf826bf8 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -5,15 +5,15 @@ use std::{ time::{Instant, SystemTime}, }; -use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_WRITE_SECONDS_BUCKETS}; +use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_FSYNC_SECONDS_BUCKETS}; use anyhow::Result; use futures::Future; use metrics::{ core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, proto::MetricFamily, - register_int_counter, register_int_counter_pair, register_int_counter_pair_vec, - register_int_counter_vec, Gauge, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, - IntGaugeVec, + register_histogram_vec, register_int_counter, register_int_counter_pair, + register_int_counter_pair_vec, register_int_counter_vec, Gauge, HistogramVec, IntCounter, + IntCounterPair, IntCounterPairVec, IntCounterVec, IntGaugeVec, }; use once_cell::sync::Lazy; @@ -48,7 +48,7 @@ pub static WRITE_WAL_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_write_wal_seconds", "Seconds spent writing and syncing WAL to a disk in a single request", - DISK_WRITE_SECONDS_BUCKETS.to_vec() + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_write_wal_seconds histogram") }); @@ -56,7 +56,7 @@ pub static FLUSH_WAL_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_flush_wal_seconds", "Seconds spent syncing WAL to a disk", - DISK_WRITE_SECONDS_BUCKETS.to_vec() + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_flush_wal_seconds histogram") }); @@ -64,10 +64,28 @@ pub static PERSIST_CONTROL_FILE_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_persist_control_file_seconds", "Seconds to persist and sync control file", - DISK_WRITE_SECONDS_BUCKETS.to_vec() + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_persist_control_file_seconds histogram vec") }); +pub static WAL_STORAGE_OPERATION_SECONDS: Lazy = Lazy::new(|| { + register_histogram_vec!( + "safekeeper_wal_storage_operation_seconds", + "Seconds spent on WAL storage operations", + &["operation"], + 
DISK_FSYNC_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_wal_storage_operation_seconds histogram vec") +}); +pub static MISC_OPERATION_SECONDS: Lazy = Lazy::new(|| { + register_histogram_vec!( + "safekeeper_misc_operation_seconds", + "Seconds spent on miscellaneous operations", + &["operation"], + DISK_FSYNC_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_misc_operation_seconds histogram vec") +}); pub static PG_IO_BYTES: Lazy = Lazy::new(|| { register_int_counter_vec!( "safekeeper_pg_io_bytes_total", @@ -126,7 +144,7 @@ pub static BROKER_PUSH_ALL_UPDATES_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_broker_push_update_seconds", "Seconds to push all timeline updates to the broker", - DISK_WRITE_SECONDS_BUCKETS.to_vec() + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_broker_push_update_seconds histogram vec") }); diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 4d0992e8bda9..33ec39b852f4 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -15,6 +15,7 @@ use storage_broker::proto::SafekeeperTimelineInfo; use tracing::*; use crate::control_file; +use crate::metrics::MISC_OPERATION_SECONDS; use crate::send_wal::HotStandbyFeedback; use crate::state::TimelineState; @@ -696,6 +697,10 @@ where &mut self, msg: &ProposerElected, ) -> Result> { + let _timer = MISC_OPERATION_SECONDS + .with_label_values(&["handle_elected"]) + .start_timer(); + info!("received ProposerElected {:?}", msg); if self.state.acceptor_state.term < msg.term { let mut state = self.state.start_change(); diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index e0f7b65aef84..dca64140827f 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -189,7 +189,12 @@ where /// Persist given state. c.f. start_change. 
pub async fn finish_change(&mut self, s: &TimelinePersistentState) -> Result<()> { - self.pers.persist(s).await?; + if s.eq(&*self.pers) { + // nothing to do if state didn't change + } else { + self.pers.persist(s).await?; + } + // keep in memory values up to date self.inmem.commit_lsn = s.commit_lsn; self.inmem.backup_lsn = s.backup_lsn; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index f632cd6fb3ec..132e5ec32f4f 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -36,10 +36,10 @@ use crate::timeline_guard::ResidenceGuard; use crate::timeline_manager::{AtomicStatus, ManagerCtl}; use crate::timelines_set::TimelinesSet; use crate::wal_backup::{self}; -use crate::wal_backup_partial::PartialRemoteSegment; +use crate::wal_backup_partial::{PartialRemoteSegment, RateLimiter}; use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; -use crate::metrics::{FullTimelineInfo, WalStorageMetrics}; +use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS}; use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; use crate::{debug_dump, timeline_manager, wal_storage}; use crate::{GlobalTimelines, SafeKeeperConf}; @@ -587,6 +587,7 @@ impl Timeline { shared_state: &mut WriteGuardSharedState<'_>, conf: &SafeKeeperConf, broker_active_set: Arc, + partial_backup_rate_limiter: RateLimiter, ) -> Result<()> { match fs::metadata(&self.timeline_dir).await { Ok(_) => { @@ -617,7 +618,7 @@ impl Timeline { return Err(e); } - self.bootstrap(conf, broker_active_set); + self.bootstrap(conf, broker_active_set, partial_backup_rate_limiter); Ok(()) } @@ -626,6 +627,7 @@ impl Timeline { self: &Arc, conf: &SafeKeeperConf, broker_active_set: Arc, + partial_backup_rate_limiter: RateLimiter, ) { let (tx, rx) = self.manager_ctl.bootstrap_manager(); @@ -637,6 +639,7 @@ impl Timeline { broker_active_set, tx, rx, + partial_backup_rate_limiter, )); } @@ -856,28 +859,40 @@ impl Timeline { } debug!("requesting WalResidentTimeline guard"); - - // Wait 5 seconds for the guard to be acquired, should be enough for uneviction. - // If it times out, most likely there is a deadlock in the manager task. - let res = tokio::time::timeout( - Duration::from_secs(5), + let started_at = Instant::now(); + let status_before = self.mgr_status.get(); + + // Wait 30 seconds for the guard to be acquired. It can time out if someone is + // holding the lock (e.g. during `SafeKeeper::process_msg()`) or manager task + // is stuck. 
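
// Illustrative sketch (not part of the patch) of the "skip no-op persists" change in
// `finish_change` above: compare the new state against what is already durable and only pay
// for a control-file write (and fsync) when something actually changed. Types are simplified
// stand-ins for TimelinePersistentState.
#[derive(PartialEq, Clone)]
struct SketchPersistentState { commit_lsn: u64, backup_lsn: u64 }

fn needs_persist(durable: &SketchPersistentState, new_state: &SketchPersistentState) -> bool {
    // The patch performs the equivalent `s.eq(&*self.pers)` check before calling `persist`.
    durable != new_state
}
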
+ let res = tokio::time::timeout_at( + started_at + Duration::from_secs(30), self.manager_ctl.wal_residence_guard(), ) .await; let guard = match res { - Ok(Ok(guard)) => guard, + Ok(Ok(guard)) => { + let finished_at = Instant::now(); + let elapsed = finished_at - started_at; + MISC_OPERATION_SECONDS + .with_label_values(&["wal_residence_guard"]) + .observe(elapsed.as_secs_f64()); + + guard + } Ok(Err(e)) => { warn!( - "error while acquiring WalResidentTimeline guard (current state {:?}): {}", - self.mgr_status.get(), - e + "error while acquiring WalResidentTimeline guard, statuses {:?} => {:?}", + status_before, + self.mgr_status.get() ); return Err(e); } Err(_) => { warn!( - "timeout while acquiring WalResidentTimeline guard (current state {:?})", + "timeout while acquiring WalResidentTimeline guard, statuses {:?} => {:?}", + status_before, self.mgr_status.get() ); anyhow::bail!("timeout while acquiring WalResidentTimeline guard"); diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index b303d41b7bab..e4ab65290d52 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -5,6 +5,7 @@ use anyhow::Context; use camino::Utf8PathBuf; use remote_storage::RemotePath; +use std::time::Instant; use tokio::{ fs::File, io::{AsyncRead, AsyncWriteExt}, @@ -48,6 +49,7 @@ impl Manager { .flush_lsn .segment_number(self.wal_seg_size) == self.last_removed_segno + 1 + && self.resident_since.elapsed() >= self.conf.eviction_min_resident } /// Evict the timeline to remote storage. @@ -91,6 +93,8 @@ impl Manager { return; } + self.resident_since = Instant::now(); + info!("successfully restored evicted timeline"); } } diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index c3abeac6449f..debf8c824f2d 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -22,7 +22,7 @@ use utils::lsn::Lsn; use crate::{ control_file::{FileStorage, Storage}, - metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL}, + metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS}, recovery::recovery_main, remove_wal::calc_horizon_lsn, safekeeper::Term, @@ -32,7 +32,7 @@ use crate::{ timeline_guard::{AccessService, GuardId, ResidenceGuard}, timelines_set::{TimelineSetGuard, TimelinesSet}, wal_backup::{self, WalBackupTaskHandle}, - wal_backup_partial::{self, PartialRemoteSegment}, + wal_backup_partial::{self, PartialRemoteSegment, RateLimiter}, SafeKeeperConf, }; @@ -185,6 +185,11 @@ pub(crate) struct Manager { // misc pub(crate) access_service: AccessService, + pub(crate) partial_backup_rate_limiter: RateLimiter, + + // Anti-flapping state: we evict timelines eagerly if they are inactive, but should not + // evict them if they go inactive very soon after being restored. + pub(crate) resident_since: std::time::Instant, } /// This task gets spawned alongside each timeline and is responsible for managing the timeline's @@ -197,6 +202,7 @@ pub async fn main_task( broker_active_set: Arc, manager_tx: tokio::sync::mpsc::UnboundedSender, mut manager_rx: tokio::sync::mpsc::UnboundedReceiver, + partial_backup_rate_limiter: RateLimiter, ) { tli.set_status(Status::Started); @@ -209,7 +215,14 @@ pub async fn main_task( } }; - let mut mgr = Manager::new(tli, conf, broker_active_set, manager_tx).await; + let mut mgr = Manager::new( + tli, + conf, + broker_active_set, + manager_tx, + partial_backup_rate_limiter, + ) + .await; // Start recovery task which always runs on the timeline. 
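
// Illustrative sketch (not part of the patch) of the anti-flapping rule added above: a
// timeline only becomes eligible for eviction once it has been resident for at least
// `eviction_min_resident`, so an un-evicted but idle timeline does not bounce straight back
// into eviction. Field and type names are stand-ins.
use std::time::{Duration, Instant};

struct ResidencyState {
    resident_since: Instant,
    eviction_min_resident: Duration,
}

impl ResidencyState {
    // `otherwise_ready` stands for the existing eviction conditions (offloading done, etc.).
    fn eviction_allowed(&self, otherwise_ready: bool) -> bool {
        otherwise_ready && self.resident_since.elapsed() >= self.eviction_min_resident
    }

    // Called after a successful restore (un-eviction) to restart the residency clock.
    fn on_restored(&mut self) {
        self.resident_since = Instant::now();
    }
}
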
if !mgr.is_offloaded && mgr.conf.peer_recovery_enabled { @@ -321,6 +334,7 @@ impl Manager { conf: SafeKeeperConf, broker_active_set: Arc, manager_tx: tokio::sync::mpsc::UnboundedSender, + partial_backup_rate_limiter: RateLimiter, ) -> Manager { let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await; Manager { @@ -339,6 +353,8 @@ impl Manager { partial_backup_uploaded, access_service: AccessService::new(manager_tx), tli, + partial_backup_rate_limiter, + resident_since: std::time::Instant::now(), } } @@ -357,6 +373,10 @@ impl Manager { /// Get a snapshot of the timeline state. async fn state_snapshot(&self) -> StateSnapshot { + let _timer = MISC_OPERATION_SECONDS + .with_label_values(&["state_snapshot"]) + .start_timer(); + StateSnapshot::new( self.tli.read_shared_state().await, self.conf.heartbeat_timeout, @@ -521,6 +541,7 @@ impl Manager { self.partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task( self.wal_resident_timeline(), self.conf.clone(), + self.partial_backup_rate_limiter.clone(), ))); } diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 45e08ede3c0a..f57da5c7cbf1 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -5,6 +5,7 @@ use crate::safekeeper::ServerInfo; use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}; use crate::timelines_set::TimelinesSet; +use crate::wal_backup_partial::RateLimiter; use crate::SafeKeeperConf; use anyhow::{bail, Context, Result}; use camino::Utf8PathBuf; @@ -14,15 +15,23 @@ use std::collections::HashMap; use std::str::FromStr; use std::sync::atomic::Ordering; use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; use tracing::*; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; struct GlobalTimelinesState { timelines: HashMap>, + + // A tombstone indicates this timeline used to exist has been deleted. These are used to prevent + // on-demand timeline creation from recreating deleted timelines. This is only soft-enforced, as + // this map is dropped on restart. + tombstones: HashMap, + conf: Option, broker_active_set: Arc, load_lock: Arc>, + partial_backup_rate_limiter: RateLimiter, } // Used to prevent concurrent timeline loading. @@ -37,8 +46,12 @@ impl GlobalTimelinesState { } /// Get dependencies for a timeline constructor. - fn get_dependencies(&self) -> (SafeKeeperConf, Arc) { - (self.get_conf().clone(), self.broker_active_set.clone()) + fn get_dependencies(&self) -> (SafeKeeperConf, Arc, RateLimiter) { + ( + self.get_conf().clone(), + self.broker_active_set.clone(), + self.partial_backup_rate_limiter.clone(), + ) } /// Insert timeline into the map. Returns error if timeline with the same id already exists. 
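
// Illustrative sketch (not part of the patch) of the tombstone mechanism introduced in
// timelines_global_map.rs: deletion leaves a timestamped marker so a late-arriving compute
// cannot silently recreate the timeline, and periodic housekeeping prunes markers older than
// a TTL. `SketchTimelineId` is a stand-in for TenantTimelineId.
use std::collections::HashMap;
use std::time::{Duration, Instant};

type SketchTimelineId = u64;

#[derive(Default)]
struct Tombstones {
    deleted_at: HashMap<SketchTimelineId, Instant>,
}

impl Tombstones {
    fn mark_deleted(&mut self, id: SketchTimelineId) {
        self.deleted_at.insert(id, Instant::now());
    }

    // Creation is refused while a tombstone for the id is still present.
    fn allows_creation(&self, id: &SketchTimelineId) -> bool {
        !self.deleted_at.contains_key(id)
    }

    // Called periodically (once a day in the patch) to drop stale markers.
    fn housekeeping(&mut self, ttl: Duration) {
        let now = Instant::now();
        self.deleted_at.retain(|_, t| now.duration_since(*t) < ttl);
    }
}
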
@@ -58,14 +71,21 @@ impl GlobalTimelinesState { .cloned() .ok_or(TimelineError::NotFound(*ttid)) } + + fn delete(&mut self, ttid: TenantTimelineId) { + self.timelines.remove(&ttid); + self.tombstones.insert(ttid, Instant::now()); + } } static TIMELINES_STATE: Lazy> = Lazy::new(|| { Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), + tombstones: HashMap::new(), conf: None, broker_active_set: Arc::new(TimelinesSet::default()), load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)), + partial_backup_rate_limiter: RateLimiter::new(1), }) }); @@ -79,6 +99,7 @@ impl GlobalTimelines { // lock, so use explicit block let tenants_dir = { let mut state = TIMELINES_STATE.lock().unwrap(); + state.partial_backup_rate_limiter = RateLimiter::new(conf.partial_backup_concurrency); state.conf = Some(conf); // Iterate through all directories and load tenants for all directories @@ -122,7 +143,7 @@ impl GlobalTimelines { /// this function is called during init when nothing else is running, so /// this is fine. async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> { - let (conf, broker_active_set) = { + let (conf, broker_active_set, partial_backup_rate_limiter) = { let state = TIMELINES_STATE.lock().unwrap(); state.get_dependencies() }; @@ -145,7 +166,11 @@ impl GlobalTimelines { .unwrap() .timelines .insert(ttid, tli.clone()); - tli.bootstrap(&conf, broker_active_set.clone()); + tli.bootstrap( + &conf, + broker_active_set.clone(), + partial_backup_rate_limiter.clone(), + ); } // If we can't load a timeline, it's most likely because of a corrupted // directory. We will log an error and won't allow to delete/recreate @@ -178,20 +203,27 @@ impl GlobalTimelines { _guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>, ttid: TenantTimelineId, ) -> Result> { - let (conf, broker_active_set) = TIMELINES_STATE.lock().unwrap().get_dependencies(); + let (conf, broker_active_set, partial_backup_rate_limiter) = + TIMELINES_STATE.lock().unwrap().get_dependencies(); match Timeline::load_timeline(&conf, ttid) { Ok(timeline) => { let tli = Arc::new(timeline); // TODO: prevent concurrent timeline creation/loading - TIMELINES_STATE - .lock() - .unwrap() - .timelines - .insert(ttid, tli.clone()); + { + let mut state = TIMELINES_STATE.lock().unwrap(); + + // We may be have been asked to load a timeline that was previously deleted (e.g. from `pull_timeline.rs`). We trust + // that the human doing this manual intervention knows what they are doing, and remove its tombstone. + if state.tombstones.remove(&ttid).is_some() { + warn!("Un-deleted timeline {ttid}"); + } + + state.timelines.insert(ttid, tli.clone()); + } - tli.bootstrap(&conf, broker_active_set); + tli.bootstrap(&conf, broker_active_set, partial_backup_rate_limiter); Ok(tli) } @@ -216,18 +248,23 @@ impl GlobalTimelines { /// Create a new timeline with the given id. If the timeline already exists, returns /// an existing timeline. - pub async fn create( + pub(crate) async fn create( ttid: TenantTimelineId, server_info: ServerInfo, commit_lsn: Lsn, local_start_lsn: Lsn, ) -> Result> { - let (conf, broker_active_set) = { + let (conf, broker_active_set, partial_backup_rate_limiter) = { let state = TIMELINES_STATE.lock().unwrap(); if let Ok(timeline) = state.get(&ttid) { // Timeline already exists, return it. 
return Ok(timeline); } + + if state.tombstones.contains_key(&ttid) { + anyhow::bail!("Timeline {ttid} is deleted, refusing to recreate"); + } + state.get_dependencies() }; @@ -257,7 +294,12 @@ impl GlobalTimelines { // Bootstrap is transactional, so if it fails, the timeline will be deleted, // and the state on disk should remain unchanged. if let Err(e) = timeline - .init_new(&mut shared_state, &conf, broker_active_set) + .init_new( + &mut shared_state, + &conf, + broker_active_set, + partial_backup_rate_limiter, + ) .await { // Note: the most likely reason for init failure is that the timeline @@ -282,17 +324,19 @@ impl GlobalTimelines { /// Get a timeline from the global map. If it's not present, it doesn't exist on disk, /// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid, /// i.e. loaded in memory and not cancelled. - pub fn get(ttid: TenantTimelineId) -> Result, TimelineError> { - let res = TIMELINES_STATE.lock().unwrap().get(&ttid); - - match res { + pub(crate) fn get(ttid: TenantTimelineId) -> Result, TimelineError> { + let tli_res = { + let state = TIMELINES_STATE.lock().unwrap(); + state.get(&ttid) + }; + match tli_res { Ok(tli) => { if tli.is_cancelled() { return Err(TimelineError::Cancelled(ttid)); } Ok(tli) } - _ => res, + _ => tli_res, } } @@ -321,12 +365,26 @@ impl GlobalTimelines { /// Cancels timeline, then deletes the corresponding data directory. /// If only_local, doesn't remove WAL segments in remote storage. - pub async fn delete( + pub(crate) async fn delete( ttid: &TenantTimelineId, only_local: bool, ) -> Result { - let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid); - match tli_res { + let tli_res = { + let state = TIMELINES_STATE.lock().unwrap(); + + if state.tombstones.contains_key(ttid) { + // Presence of a tombstone guarantees that a previous deletion has completed and there is no work to do. + info!("Timeline {ttid} was already deleted"); + return Ok(TimelineDeleteForceResult { + dir_existed: false, + was_active: false, + }); + } + + state.get(ttid) + }; + + let result = match tli_res { Ok(timeline) => { let was_active = timeline.broker_active.load(Ordering::Relaxed); @@ -336,11 +394,6 @@ impl GlobalTimelines { info!("deleting timeline {}, only_local={}", ttid, only_local); let dir_existed = timeline.delete(&mut shared_state, only_local).await?; - // Remove timeline from the map. - // FIXME: re-enable it once we fix the issue with recreation of deleted timelines - // https://github.com/neondatabase/neon/issues/3146 - // TIMELINES_STATE.lock().unwrap().timelines.remove(ttid); - Ok(TimelineDeleteForceResult { dir_existed, was_active, // TODO: we probably should remove this field @@ -356,7 +409,14 @@ impl GlobalTimelines { was_active: false, }) } - } + }; + + // Finalize deletion, by dropping Timeline objects and storing smaller tombstones. The tombstones + // are used to prevent still-running computes from re-creating the same timeline when they send data, + // and to speed up repeated deletion calls by avoiding re-listing objects. + TIMELINES_STATE.lock().unwrap().delete(*ttid); + + result } /// Deactivates and deletes all timelines for the tenant. 
Returns map of all timelines which @@ -402,19 +462,20 @@ impl GlobalTimelines { tenant_id, ))?; - // FIXME: we temporarily disabled removing timelines from the map, see `delete_force` - // let tlis_after_delete = Self::get_all_for_tenant(*tenant_id); - // if !tlis_after_delete.is_empty() { - // // Some timelines were created while we were deleting them, returning error - // // to the caller, so it can retry later. - // bail!( - // "failed to delete all timelines for tenant {}: some timelines were created while we were deleting them", - // tenant_id - // ); - // } - Ok(deleted) } + + pub fn housekeeping(tombstone_ttl: &Duration) { + let mut state = TIMELINES_STATE.lock().unwrap(); + + // We keep tombstones long enough to have a good chance of preventing rogue computes from re-creating deleted + // timelines. If a compute kept running for longer than this TTL (or across a safekeeper restart) then they + // may recreate a deleted timeline. + let now = Instant::now(); + state + .tombstones + .retain(|_, v| now.duration_since(*v) < *tombstone_ttl); + } } #[derive(Clone, Copy, Serialize)] diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 9c7cd0888d83..825851c97c9a 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -18,6 +18,8 @@ //! This way control file stores information about all potentially existing //! remote partial segments and can clean them up after uploading a newer version. +use std::sync::Arc; + use camino::Utf8PathBuf; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; use remote_storage::RemotePath; @@ -27,7 +29,7 @@ use tracing::{debug, error, info, instrument, warn}; use utils::lsn::Lsn; use crate::{ - metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, + metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, safekeeper::Term, timeline::WalResidentTimeline, timeline_manager::StateSnapshot, @@ -35,6 +37,30 @@ use crate::{ SafeKeeperConf, }; +#[derive(Clone)] +pub struct RateLimiter { + semaphore: Arc, +} + +impl RateLimiter { + pub fn new(permits: usize) -> Self { + Self { + semaphore: Arc::new(tokio::sync::Semaphore::new(permits)), + } + } + + async fn acquire_owned(&self) -> tokio::sync::OwnedSemaphorePermit { + let _timer = MISC_OPERATION_SECONDS + .with_label_values(&["partial_permit_acquire"]) + .start_timer(); + self.semaphore + .clone() + .acquire_owned() + .await + .expect("semaphore is closed") + } +} + #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum UploadStatus { /// Upload is in progress. This status should be used only for garbage collection, @@ -208,6 +234,9 @@ impl PartialBackup { /// Upload the latest version of the partial segment and garbage collect older versions. 
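
// Illustrative usage sketch (not part of the patch) of the semaphore-backed limiter added in
// wal_backup_partial.rs: each timeline task clones one shared limiter and holds a permit for
// the duration of an upload, capping concurrent partial-segment uploads across timelines.
// `upload_partial_segment` is a hypothetical placeholder.
use std::sync::Arc;
use tokio::sync::Semaphore;

async fn upload_with_limit(sem: Arc<Semaphore>) -> anyhow::Result<()> {
    // The permit is released when `_permit` is dropped at the end of the scope.
    let _permit = sem.acquire_owned().await?;
    upload_partial_segment().await
}

async fn upload_partial_segment() -> anyhow::Result<()> {
    Ok(()) // placeholder for the real upload
}
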
#[instrument(name = "upload", skip_all, fields(name = %prepared.name))] async fn do_upload(&mut self, prepared: &PartialRemoteSegment) -> anyhow::Result<()> { + let _timer = MISC_OPERATION_SECONDS + .with_label_values(&["partial_do_upload"]) + .start_timer(); info!("starting upload {:?}", prepared); let state_0 = self.state.clone(); @@ -307,6 +336,7 @@ pub(crate) fn needs_uploading( pub async fn main_task( tli: WalResidentTimeline, conf: SafeKeeperConf, + limiter: RateLimiter, ) -> Option { debug!("started"); let await_duration = conf.partial_backup_timeout; @@ -411,6 +441,9 @@ pub async fn main_task( continue 'outer; } + // limit concurrent uploads + let _upload_permit = limiter.acquire_owned().await; + let prepared = backup.prepare_upload().await; if let Some(seg) = &uploaded_segment { if seg.eq_without_status(&prepared) { diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 4a97eb3993f3..091571111e5c 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -4,9 +4,10 @@ //! use anyhow::{Context, Result}; use postgres_backend::QueryError; -use std::{future, time::Duration}; +use std::time::Duration; use tokio::net::TcpStream; use tokio_io_timeout::TimeoutReader; +use tokio_util::sync::CancellationToken; use tracing::*; use utils::{auth::Scope, measured_stream::MeasuredStream}; @@ -100,7 +101,7 @@ async fn handle_socket( // libpq protocol between safekeeper and walproposer / pageserver // We don't use shutdown. pgbackend - .run(&mut conn_handler, future::pending::<()>) + .run(&mut conn_handler, &CancellationToken::new()) .await } diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 74c4693ccd9b..ded8571a3e27 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -23,7 +23,9 @@ use tokio::io::{AsyncReadExt, AsyncSeekExt}; use tracing::*; use utils::crashsafe::durable_rename; -use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS}; +use crate::metrics::{ + time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, +}; use crate::state::TimelinePersistentState; use crate::wal_backup::{read_object, remote_timeline_path}; use crate::SafeKeeperConf; @@ -331,6 +333,10 @@ impl Storage for PhysicalStorage { } async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()> { + let _timer = WAL_STORAGE_OPERATION_SECONDS + .with_label_values(&["initialize_first_segment"]) + .start_timer(); + let segno = init_lsn.segment_number(self.wal_seg_size); let (mut file, _) = self.open_or_create(segno).await?; let major_pg_version = self.pg_version / 10000; @@ -422,6 +428,10 @@ impl Storage for PhysicalStorage { /// Truncate written WAL by removing all WAL segments after the given LSN. /// end_pos must point to the end of the WAL record. 
async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { + let _timer = WAL_STORAGE_OPERATION_SECONDS + .with_label_values(&["truncate_wal"]) + .start_timer(); + // Streaming must not create a hole, so truncate cannot be called on non-written lsn if self.write_lsn != Lsn(0) && end_pos > self.write_lsn { bail!( @@ -497,6 +507,10 @@ async fn remove_segments_from_disk( wal_seg_size: usize, remove_predicate: impl Fn(XLogSegNo) -> bool, ) -> Result<()> { + let _timer = WAL_STORAGE_OPERATION_SECONDS + .with_label_values(&["remove_segments_from_disk"]) + .start_timer(); + let mut n_removed = 0; let mut min_removed = u64::MAX; let mut max_removed = u64::MIN; diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 43835c7f4411..0c6d97ddfaad 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -187,6 +187,8 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { enable_offload: false, delete_offloaded_wal: false, control_file_save_interval: Duration::from_secs(1), + partial_backup_concurrency: 1, + eviction_min_resident: Duration::ZERO, }; let mut global = GlobalMap::new(disk, conf.clone())?; diff --git a/storage_controller/src/background_node_operations.rs b/storage_controller/src/background_node_operations.rs index 74b7e7c84955..6f1355eb6848 100644 --- a/storage_controller/src/background_node_operations.rs +++ b/storage_controller/src/background_node_operations.rs @@ -3,7 +3,7 @@ use std::{borrow::Cow, fmt::Debug, fmt::Display}; use tokio_util::sync::CancellationToken; use utils::id::NodeId; -pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 10; +pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 32; #[derive(Copy, Clone)] pub(crate) struct Drain { diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 4d0f8006aaa4..c46539485c1f 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -323,7 +323,7 @@ impl ComputeHook { if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running { tracing::info!("Reconfiguring endpoint {}", endpoint_name,); endpoint - .reconfigure(compute_pageservers.clone(), *stripe_size) + .reconfigure(compute_pageservers.clone(), *stripe_size, None) .await .map_err(NotifyError::NeonLocal)?; } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 680e6f09c4a0..7446ad53a231 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -10,8 +10,9 @@ use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; use metrics::{BuildInfo, NeonMetrics}; +use pageserver_api::controller_api::TenantCreateRequest; use pageserver_api::models::{ - TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest, + TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, TimelineCreateRequest, }; use pageserver_api::shard::TenantShardId; diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 4d17dff9feaf..fff44aaf2670 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -226,7 +226,7 @@ impl Node { fn is_fatal(e: &mgmt_api::Error) -> bool { use mgmt_api::Error::*; match e { - ReceiveBody(_) | ReceiveErrorBody(_) => false, + SendRequest(_) | ReceiveBody(_) | ReceiveErrorBody(_) => false, 
ApiError(StatusCode::SERVICE_UNAVAILABLE, _) | ApiError(StatusCode::GATEWAY_TIMEOUT, _) | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index fe97f724c132..886ceae90fbf 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,6 +1,7 @@ use crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; use crate::service; +use pageserver_api::controller_api::PlacementPolicy; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; @@ -29,6 +30,7 @@ pub(super) struct Reconciler { /// of a tenant's state from when we spawned a reconcile task. pub(super) tenant_shard_id: TenantShardId, pub(crate) shard: ShardIdentity, + pub(crate) placement_policy: PlacementPolicy, pub(crate) generation: Option, pub(crate) intent: TargetState, @@ -641,7 +643,7 @@ impl Reconciler { generation, &self.shard, &self.config, - !self.intent.secondary.is_empty(), + &self.placement_policy, ); match self.observed.locations.get(&node.get_id()) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { @@ -801,8 +803,15 @@ pub(crate) fn attached_location_conf( generation: Generation, shard: &ShardIdentity, config: &TenantConfig, - has_secondaries: bool, + policy: &PlacementPolicy, ) -> LocationConfig { + let has_secondaries = match policy { + PlacementPolicy::Attached(0) | PlacementPolicy::Detached | PlacementPolicy::Secondary => { + false + } + PlacementPolicy::Attached(_) => true, + }; + LocationConfig { mode: LocationConfigMode::AttachedSingle, generation: generation.into(), diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index e329f42dd610..aada1939eeea 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -32,10 +32,10 @@ use itertools::Itertools; use pageserver_api::{ controller_api::{ NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, - ShardSchedulingPolicy, TenantCreateResponse, TenantCreateResponseShard, - TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, - TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, - UtilizationScore, + ShardSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, + TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, + TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, + TenantShardMigrateResponse, UtilizationScore, }, models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest}, }; @@ -46,10 +46,9 @@ use crate::pageserver_client::PageserverClient; use pageserver_api::{ models::{ self, LocationConfig, LocationConfigListResponse, LocationConfigMode, - PageserverUtilization, ShardParameters, TenantConfig, TenantCreateRequest, - TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation, - TenantShardSplitRequest, TenantShardSplitResponse, TenantTimeTravelRequest, - TimelineCreateRequest, TimelineInfo, + PageserverUtilization, ShardParameters, TenantConfig, TenantLocationConfigRequest, + TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, + TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo, }, shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId}, upcall_api::{ @@ -152,6 +151,10 @@ struct ServiceState { /// controller API. 
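The widened `is_fatal` classification above now treats `SendRequest` failures (connection-level errors before any response is received) as retryable, alongside body-receive errors and the existing 503/504/408 statuses. Purely as an illustration of how a caller can use such a classifier, here is a generic retry wrapper; it is a sketch, not the storage controller's actual retry code, and backoff is omitted for brevity:

    use std::future::Future;

    /// Hypothetical helper: retry `attempt` until it succeeds, a fatal error is
    /// returned, or `max_attempts` is exhausted.
    async fn call_with_retries<T, E, Fut>(
        mut attempt: impl FnMut() -> Fut,
        is_fatal: impl Fn(&E) -> bool,
        max_attempts: usize,
    ) -> Result<T, E>
    where
        Fut: Future<Output = Result<T, E>>,
    {
        let mut last_err = None;
        for _ in 0..max_attempts {
            match attempt().await {
                Ok(v) => return Ok(v),
                // Fatal errors (e.g. auth failures) are surfaced immediately.
                Err(e) if is_fatal(&e) => return Err(e),
                // Transient errors (connect/send/receive, 503, ...) are retried.
                Err(e) => last_err = Some(e),
            }
        }
        Err(last_err.expect("max_attempts must be at least 1"))
    }
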
fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { match e { + mgmt_api::Error::SendRequest(e) => { + // Presume errors sending requests are connectivity/availability issues + ApiError::ResourceUnavailable(format!("{node} error sending request: {e}").into()) + } mgmt_api::Error::ReceiveErrorBody(str) => { // Presume errors receiving body are connectivity/availability issues ApiError::ResourceUnavailable( @@ -1391,7 +1394,7 @@ impl Service { tenant_shard.generation.unwrap(), &tenant_shard.shard, &tenant_shard.config, - false, + &PlacementPolicy::Attached(0), )), }, )]); @@ -3322,7 +3325,7 @@ impl Service { generation, &child_shard, &config, - matches!(policy, PlacementPolicy::Attached(n) if n > 0), + &policy, )), }, ); @@ -4063,7 +4066,14 @@ impl Service { placement_policy: Some(PlacementPolicy::Attached(0)), // No secondaries, for convenient debug/hacking // There is no way to know what the tenant's config was: revert to defaults - config: TenantConfig::default(), + // + // TODO: remove `switch_aux_file_policy` once we finish auxv2 migration + // + // we write to both v1+v2 storage, so that the test case can use either storage format for testing + config: TenantConfig { + switch_aux_file_policy: Some(models::AuxFilePolicy::CrossValidation), + ..TenantConfig::default() + }, }) .await?; @@ -5564,9 +5574,12 @@ impl Service { break; } - let mut can_take = attached - expected_attached; + let can_take = attached - expected_attached; + let needed = fill_requirement - plan.len(); + let mut take = std::cmp::min(can_take, needed); + let mut remove_node = false; - while can_take > 0 { + while take > 0 { match tids_by_node.get_mut(&node_id) { Some(tids) => match tids.pop() { Some(tid) => { @@ -5578,7 +5591,7 @@ impl Service { if *promoted < max_promote_for_tenant { plan.push(tid); *promoted += 1; - can_take -= 1; + take -= 1; } } None => { diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 45295bc59be8..3fcf31ac1028 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -908,12 +908,8 @@ impl TenantShard { .generation .expect("Attempted to enter attached state without a generation"); - let wanted_conf = attached_location_conf( - generation, - &self.shard, - &self.config, - !self.intent.secondary.is_empty(), - ); + let wanted_conf = + attached_location_conf(generation, &self.shard, &self.config, &self.policy); match self.observed.locations.get(&node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} Some(_) | None => { @@ -1099,6 +1095,7 @@ impl TenantShard { let mut reconciler = Reconciler { tenant_shard_id: self.tenant_shard_id, shard: self.shard, + placement_policy: self.policy.clone(), generation: self.generation, intent: reconciler_intent, detach, diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 4eb8580e32cf..f687b24320ce 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -259,7 +259,7 @@ pub(crate) enum BlobDataParseResult { Incorrect(Vec), } -fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { +pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { match name.rsplit_once('-') { // FIXME: this is gross, just use a regex? 
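One behavioural fix in the `service.rs` hunk above: when planning a fill, the number of shards taken from a donor node is now clamped not only by how many that node can give up, but also by how many the node being filled still needs. A small illustrative helper capturing that arithmetic (the names are made up; the real code updates `take` inside the planning loop):

    /// Illustrative only: how many shard promotions to plan from one donor node.
    fn promotions_to_take(
        attached: usize,          // shards currently attached to the donor node
        expected_attached: usize, // how many it should keep after rebalancing
        fill_requirement: usize,  // how many the node being filled wants in total
        already_planned: usize,   // promotions already added to the plan
    ) -> usize {
        let can_take = attached.saturating_sub(expected_attached);
        let needed = fill_requirement.saturating_sub(already_planned);
        // Without the `needed` clamp, a single donor could overfill the target node.
        std::cmp::min(can_take, needed)
    }
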
Some((layer_filename, gen)) if gen.len() == 8 => { diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs new file mode 100644 index 000000000000..2ef802229d1d --- /dev/null +++ b/storage_scrubber/src/find_large_objects.rs @@ -0,0 +1,120 @@ +use futures::{StreamExt, TryStreamExt}; +use pageserver::tenant::storage_layer::LayerName; +use serde::{Deserialize, Serialize}; + +use crate::{ + checks::parse_layer_object_name, init_remote, list_objects_with_retries, + metadata_stream::stream_tenants, BucketConfig, NodeKind, +}; + +#[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] +enum LargeObjectKind { + DeltaLayer, + ImageLayer, + Other, +} + +impl LargeObjectKind { + fn from_key(key: &str) -> Self { + let fname = key.split('/').last().unwrap(); + + let Ok((layer_name, _generation)) = parse_layer_object_name(fname) else { + return LargeObjectKind::Other; + }; + + match layer_name { + LayerName::Image(_) => LargeObjectKind::ImageLayer, + LayerName::Delta(_) => LargeObjectKind::DeltaLayer, + } + } +} + +#[derive(Serialize, Deserialize, Clone)] +pub struct LargeObject { + pub key: String, + pub size: u64, + kind: LargeObjectKind, +} + +#[derive(Serialize, Deserialize)] +pub struct LargeObjectListing { + pub objects: Vec, +} + +pub async fn find_large_objects( + bucket_config: BucketConfig, + min_size: u64, + ignore_deltas: bool, + concurrency: usize, +) -> anyhow::Result { + let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; + let tenants = std::pin::pin!(stream_tenants(&s3_client, &target)); + + let objects_stream = tenants.map_ok(|tenant_shard_id| { + let mut tenant_root = target.tenant_root(&tenant_shard_id); + let s3_client = s3_client.clone(); + async move { + let mut objects = Vec::new(); + let mut total_objects_ctr = 0u64; + // We want the objects and not just common prefixes + tenant_root.delimiter.clear(); + let mut continuation_token = None; + loop { + let fetch_response = + list_objects_with_retries(&s3_client, &tenant_root, continuation_token.clone()) + .await?; + for obj in fetch_response.contents().iter().filter(|o| { + if let Some(obj_size) = o.size { + min_size as i64 <= obj_size + } else { + false + } + }) { + let key = obj.key().expect("couldn't get key").to_owned(); + let kind = LargeObjectKind::from_key(&key); + if ignore_deltas && kind == LargeObjectKind::DeltaLayer { + continue; + } + objects.push(LargeObject { + key, + size: obj.size.unwrap() as u64, + kind, + }) + } + total_objects_ctr += fetch_response.contents().len() as u64; + match fetch_response.next_continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + + Ok((tenant_shard_id, objects, total_objects_ctr)) + } + }); + let mut objects_stream = std::pin::pin!(objects_stream.try_buffer_unordered(concurrency)); + + let mut objects = Vec::new(); + + let mut tenant_ctr = 0u64; + let mut object_ctr = 0u64; + while let Some(res) = objects_stream.next().await { + let (tenant_shard_id, objects_slice, total_objects_ctr) = res?; + objects.extend_from_slice(&objects_slice); + + object_ctr += total_objects_ctr; + tenant_ctr += 1; + if tenant_ctr % 100 == 0 { + tracing::info!( + "Scanned {tenant_ctr} shards. objects={object_ctr}, found={}, current={tenant_shard_id}.", + objects.len() + ); + } + } + + let bucket_name = target.bucket_name(); + tracing::info!( + "Scan of {bucket_name} finished. Scanned {tenant_ctr} shards. 
objects={object_ctr}, found={}.", + objects.len() + ); + Ok(LargeObjectListing { objects }) +} diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index ce0ff10ec6e1..04508519881e 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -140,7 +140,7 @@ async fn find_garbage_inner( node_kind: NodeKind, ) -> anyhow::Result { // Construct clients for S3 and for Console API - let (s3_client, target) = init_remote(bucket_config.clone(), node_kind)?; + let (s3_client, target) = init_remote(bucket_config.clone(), node_kind).await?; let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config)); // Build a set of console-known tenants, for quickly eliminating known-active tenants without having @@ -432,7 +432,7 @@ pub async fn purge_garbage( ); let (s3_client, target) = - init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind)?; + init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?; // Sanity checks on the incoming list if garbage_list.active_tenant_count == 0 { diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 64273432fc0c..9102ad9906f2 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -2,6 +2,7 @@ #![deny(clippy::undocumented_unsafe_blocks)] pub mod checks; pub mod cloud_admin_api; +pub mod find_large_objects; pub mod garbage; pub mod metadata_stream; pub mod pageserver_physical_gc; @@ -14,17 +15,10 @@ use std::fmt::Display; use std::sync::Arc; use std::time::Duration; -use anyhow::Context; -use aws_config::environment::EnvironmentVariableCredentialsProvider; -use aws_config::imds::credentials::ImdsCredentialsProvider; -use aws_config::meta::credentials::CredentialsProviderChain; -use aws_config::profile::ProfileFileCredentialsProvider; -use aws_config::retry::RetryConfig; -use aws_config::sso::SsoCredentialsProvider; -use aws_config::BehaviorVersion; -use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep}; -use aws_sdk_s3::{Client, Config}; -use aws_smithy_async::rt::sleep::TokioSleep; +use anyhow::{anyhow, Context}; +use aws_sdk_s3::config::Region; +use aws_sdk_s3::error::DisplayErrorContext; +use aws_sdk_s3::Client; use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; @@ -241,85 +235,53 @@ impl ConsoleConfig { } } -pub fn init_logging(file_name: &str) -> WorkerGuard { - let (file_writer, guard) = - tracing_appender::non_blocking(tracing_appender::rolling::never("./logs/", file_name)); - - let file_logs = fmt::Layer::new() - .with_target(false) - .with_ansi(false) - .with_writer(file_writer); +pub fn init_logging(file_name: &str) -> Option { let stderr_logs = fmt::Layer::new() .with_target(false) .with_writer(std::io::stderr); - tracing_subscriber::registry() - .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))) - .with(file_logs) - .with(stderr_logs) - .init(); - - guard -} -pub fn init_s3_client(bucket_region: Region) -> Client { - let credentials_provider = { - // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" - let chain = CredentialsProviderChain::first_try( - "env", - EnvironmentVariableCredentialsProvider::new(), - ) - // uses "AWS_PROFILE" / `aws sso login --profile ` - .or_else( - "profile-sso", - ProfileFileCredentialsProvider::builder().build(), - ); - - // Use SSO if we were given an account ID - match std::env::var("SSO_ACCOUNT_ID").ok() { - Some(sso_account) => chain.or_else( - "sso", - SsoCredentialsProvider::builder() - .account_id(sso_account) - 
.role_name("PowerUserAccess") - .start_url("https://neondb.awsapps.com/start") - .region(bucket_region.clone()) - .build(), - ), - None => chain, - } - .or_else( - // Finally try IMDS - "imds", - ImdsCredentialsProvider::builder().build(), - ) + let disable_file_logging = match std::env::var("PAGESERVER_DISABLE_FILE_LOGGING") { + Ok(s) => s == "1" || s.to_lowercase() == "true", + Err(_) => false, }; - let sleep_impl: Arc = Arc::new(TokioSleep::new()); - - let mut builder = Config::builder() - .behavior_version( - #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ - BehaviorVersion::v2023_11_09(), - ) - .region(bucket_region) - .retry_config(RetryConfig::adaptive().with_max_attempts(3)) - .sleep_impl(SharedAsyncSleep::from(sleep_impl)) - .credentials_provider(credentials_provider); - - if let Ok(endpoint) = env::var("AWS_ENDPOINT_URL") { - builder = builder.endpoint_url(endpoint) + if disable_file_logging { + tracing_subscriber::registry() + .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))) + .with(stderr_logs) + .init(); + None + } else { + let (file_writer, guard) = + tracing_appender::non_blocking(tracing_appender::rolling::never("./logs/", file_name)); + let file_logs = fmt::Layer::new() + .with_target(false) + .with_ansi(false) + .with_writer(file_writer); + tracing_subscriber::registry() + .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))) + .with(stderr_logs) + .with(file_logs) + .init(); + Some(guard) } +} - Client::from_conf(builder.build()) +pub async fn init_s3_client(bucket_region: Region) -> Client { + let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28()) + .region(bucket_region) + .load() + .await; + Client::new(&config) } -fn init_remote( +async fn init_remote( bucket_config: BucketConfig, node_kind: NodeKind, ) -> anyhow::Result<(Arc, RootTarget)> { let bucket_region = Region::new(bucket_config.region); let delimiter = "/".to_string(); - let s3_client = Arc::new(init_s3_client(bucket_region)); + let s3_client = Arc::new(init_s3_client(bucket_region).await); let s3_root = match node_kind { NodeKind::Pageserver => RootTarget::Pageserver(S3Target { @@ -344,7 +306,7 @@ async fn list_objects_with_retries( s3_target: &S3Target, continuation_token: Option, ) -> anyhow::Result { - for _ in 0..MAX_RETRIES { + for trial in 0..MAX_RETRIES { match s3_client .list_objects_v2() .bucket(&s3_target.bucket_name) @@ -356,16 +318,22 @@ async fn list_objects_with_retries( { Ok(response) => return Ok(response), Err(e) => { + if trial == MAX_RETRIES - 1 { + return Err(e) + .with_context(|| format!("Failed to list objects {MAX_RETRIES} times")); + } error!( - "list_objects_v2 query failed: {e}, bucket_name={}, prefix={}, delimiter={}", - s3_target.bucket_name, s3_target.prefix_in_bucket, s3_target.delimiter + "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}", + s3_target.bucket_name, + s3_target.prefix_in_bucket, + s3_target.delimiter, + DisplayErrorContext(e), ); tokio::time::sleep(Duration::from_secs(1)).await; } } } - - anyhow::bail!("Failed to list objects {MAX_RETRIES} times") + Err(anyhow!("unreachable unless MAX_RETRIES==0")) } async fn download_object_with_retries( diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index 222bd10ed248..d81612119263 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -1,6 +1,7 @@ use anyhow::bail; use camino::Utf8PathBuf; use 
pageserver_api::shard::TenantShardId; +use storage_scrubber::find_large_objects; use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use storage_scrubber::pageserver_physical_gc::GcMode; use storage_scrubber::scan_pageserver_metadata::scan_metadata; @@ -72,6 +73,14 @@ enum Command { #[arg(short, long, default_value_t = GcMode::IndicesOnly)] mode: GcMode, }, + FindLargeObjects { + #[arg(long = "min-size")] + min_size: u64, + #[arg(short, long, default_value_t = false)] + ignore_deltas: bool, + #[arg(long = "concurrency", short = 'j', default_value_t = 64)] + concurrency: usize, + }, } #[tokio::main] @@ -86,6 +95,7 @@ async fn main() -> anyhow::Result<()> { Command::PurgeGarbage { .. } => "purge-garbage", Command::TenantSnapshot { .. } => "tenant-snapshot", Command::PageserverPhysicalGc { .. } => "pageserver-physical-gc", + Command::FindLargeObjects { .. } => "find-large-objects", }; let _guard = init_logging(&format!( "{}_{}_{}_{}.log", @@ -186,7 +196,7 @@ async fn main() -> anyhow::Result<()> { concurrency, } => { let downloader = - SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency)?; + SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency).await?; downloader.download().await } Command::PageserverPhysicalGc { @@ -199,5 +209,20 @@ async fn main() -> anyhow::Result<()> { println!("{}", serde_json::to_string(&summary).unwrap()); Ok(()) } + Command::FindLargeObjects { + min_size, + ignore_deltas, + concurrency, + } => { + let summary = find_large_objects::find_large_objects( + bucket_config, + min_size, + ignore_deltas, + concurrency, + ) + .await?; + println!("{}", serde_json::to_string(&summary).unwrap()); + Ok(()) + } } } diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 014643312807..fb8fbc1635ae 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -160,7 +160,7 @@ pub async fn pageserver_physical_gc( min_age: Duration, mode: GcMode, ) -> anyhow::Result { - let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; let tenants = if tenant_ids.is_empty() { futures::future::Either::Left(stream_tenants(&s3_client, &target)) diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index af74ffa4cdbd..df4f29acf72b 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -199,7 +199,7 @@ pub async fn scan_metadata( bucket_config: BucketConfig, tenant_ids: Vec, ) -> anyhow::Result { - let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver)?; + let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver).await?; let tenants = if tenant_ids.is_empty() { futures::future::Either::Left(stream_tenants(&s3_client, &target)) diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 24051b03de08..553adf8f468e 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -106,7 +106,7 @@ pub async fn scan_safekeeper_metadata( let timelines = client.query(&query, &[]).await?; info!("loaded {} timelines", timelines.len()); - let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper)?; + let (s3_client, target) = 
init_remote(bucket_config, NodeKind::Safekeeper).await?; let console_config = ConsoleConfig::from_env()?; let cloud_admin_api_client = CloudAdminApiClient::new(console_config); diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index 450b337235f0..5a75f8d40ecf 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -28,13 +28,13 @@ pub struct SnapshotDownloader { } impl SnapshotDownloader { - pub fn new( + pub async fn new( bucket_config: BucketConfig, tenant_id: TenantId, output_path: Utf8PathBuf, concurrency: usize, ) -> anyhow::Result { - let (s3_client, s3_root) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + let (s3_client, s3_root) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; Ok(Self { s3_client, s3_root, @@ -215,7 +215,8 @@ impl SnapshotDownloader { } pub async fn download(&self) -> anyhow::Result<()> { - let (s3_client, target) = init_remote(self.bucket_config.clone(), NodeKind::Pageserver)?; + let (s3_client, target) = + init_remote(self.bucket_config.clone(), NodeKind::Pageserver).await?; // Generate a stream of TenantShardId let shards = stream_tenant_shards(&s3_client, &target, self.tenant_id).await?; diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 41fa8e679f28..c019cbbc7790 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -144,6 +144,8 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]: "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", + "pageserver_archive_size", + "pageserver_pitr_history_size", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", "pageserver_evictions_total", diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py new file mode 100644 index 000000000000..39baf5fab69f --- /dev/null +++ b/test_runner/fixtures/neon_api.py @@ -0,0 +1,263 @@ +from __future__ import annotations + +import time +from typing import TYPE_CHECKING, cast + +import requests + +if TYPE_CHECKING: + from typing import Any, Dict, Literal, Optional, Union + + from fixtures.pg_version import PgVersion + + +def connection_parameters_to_env(params: Dict[str, str]) -> Dict[str, str]: + return { + "PGHOST": params["host"], + "PGDATABASE": params["database"], + "PGUSER": params["role"], + "PGPASSWORD": params["password"], + } + + +class NeonAPI: + def __init__(self, neon_api_key: str, neon_api_base_url: str): + self.__neon_api_key = neon_api_key + self.__neon_api_base_url = neon_api_base_url.strip("/") + + def __request( + self, method: Union[str, bytes], endpoint: str, **kwargs: Any + ) -> requests.Response: + if "headers" not in kwargs: + kwargs["headers"] = {} + kwargs["headers"]["Authorization"] = f"Bearer {self.__neon_api_key}" + + return requests.request(method, f"{self.__neon_api_base_url}{endpoint}", **kwargs) + + def create_project( + self, + pg_version: Optional[PgVersion] = None, + name: Optional[str] = None, + branch_name: Optional[str] = None, + branch_role_name: Optional[str] = None, + branch_database_name: Optional[str] = None, + ) -> Dict[str, Any]: + data: Dict[str, Any] = { + "project": { + "branch": {}, + }, + } + if name: + data["project"]["name"] = name + if pg_version: + data["project"]["pg_version"] = int(pg_version) + if branch_name: + data["project"]["branch"]["name"] = branch_name + if branch_role_name: + 
data["project"]["branch"]["role_name"] = branch_role_name + if branch_database_name: + data["project"]["branch"]["database_name"] = branch_database_name + + resp = self.__request( + "POST", + "/projects", + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + json=data, + ) + + assert resp.status_code == 201 + + return cast("Dict[str, Any]", resp.json()) + + def get_project_details(self, project_id: str) -> Dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}", + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + ) + assert resp.status_code == 200 + return cast("Dict[str, Any]", resp.json()) + + def delete_project( + self, + project_id: str, + ) -> Dict[str, Any]: + resp = self.__request( + "DELETE", + f"/projects/{project_id}", + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def start_endpoint( + self, + project_id: str, + endpoint_id: str, + ) -> Dict[str, Any]: + resp = self.__request( + "POST", + f"/projects/{project_id}/endpoints/{endpoint_id}/start", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def suspend_endpoint( + self, + project_id: str, + endpoint_id: str, + ) -> Dict[str, Any]: + resp = self.__request( + "POST", + f"/projects/{project_id}/endpoints/{endpoint_id}/suspend", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def restart_endpoint( + self, + project_id: str, + endpoint_id: str, + ) -> Dict[str, Any]: + resp = self.__request( + "POST", + f"/projects/{project_id}/endpoints/{endpoint_id}/restart", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def create_endpoint( + self, + project_id: str, + branch_id: str, + endpoint_type: Literal["read_write", "read_only"], + settings: Dict[str, Any], + ) -> Dict[str, Any]: + data: Dict[str, Any] = { + "endpoint": { + "branch_id": branch_id, + }, + } + + if endpoint_type: + data["endpoint"]["type"] = endpoint_type + if settings: + data["endpoint"]["settings"] = settings + + resp = self.__request( + "POST", + f"/projects/{project_id}/endpoints", + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + json=data, + ) + + assert resp.status_code == 201 + + return cast("Dict[str, Any]", resp.json()) + + def get_connection_uri( + self, + project_id: str, + branch_id: Optional[str] = None, + endpoint_id: Optional[str] = None, + database_name: str = "neondb", + role_name: str = "neondb_owner", + pooled: bool = True, + ) -> Dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}/connection_uri", + params={ + "branch_id": branch_id, + "endpoint_id": endpoint_id, + "database_name": database_name, + "role_name": role_name, + "pooled": pooled, + }, + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def get_branches(self, project_id: str) -> Dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}/branches", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def get_endpoints(self, project_id: str) -> Dict[str, 
Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}/endpoints", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def get_operations(self, project_id: str) -> Dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}/operations", + headers={ + "Accept": "application/json", + "Authorization": f"Bearer {self.__neon_api_key}", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def wait_for_operation_to_finish(self, project_id: str): + has_running = True + while has_running: + has_running = False + operations = self.get_operations(project_id)["operations"] + for op in operations: + if op["status"] in {"scheduling", "running", "cancelling"}: + has_running = True + time.sleep(0.5) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 6bfe1afd1f2a..5ca31644a910 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -87,6 +87,8 @@ ) from fixtures.utils import AuxFileStore as AuxFileStore # reexport +from .neon_api import NeonAPI + """ This file contains pytest fixtures. A fixture is a test resource that can be summoned by placing its name in the test's arguments. @@ -184,6 +186,25 @@ def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Ite yield versioned_dir +@pytest.fixture(scope="session") +def neon_api_key() -> str: + api_key = os.getenv("NEON_API_KEY") + if not api_key: + raise AssertionError("Set the NEON_API_KEY environment variable") + + return api_key + + +@pytest.fixture(scope="session") +def neon_api_base_url() -> str: + return os.getenv("NEON_API_BASE_URL", "https://console-stage.neon.build/api/v2") + + +@pytest.fixture(scope="session") +def neon_api(neon_api_key: str, neon_api_base_url: str) -> NeonAPI: + return NeonAPI(neon_api_key, neon_api_base_url) + + def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "function"]: """Return either session of function scope, depending on TEST_SHARED_FIXTURES envvar. 
@@ -471,6 +492,7 @@ def __init__( pageserver_virtual_file_io_engine: Optional[str] = None, pageserver_aux_file_policy: Optional[AuxFileStore] = None, pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None, + safekeeper_extra_opts: Optional[list[str]] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -536,6 +558,8 @@ def __init__( self.pageserver_aux_file_policy = pageserver_aux_file_policy + self.safekeeper_extra_opts = safekeeper_extra_opts + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -943,6 +967,8 @@ def __exit__( # if the test threw an exception, don't check for errors # as a failing assertion would cause the cleanup below to fail ps_assert_metric_no_errors=(exc_type is None), + # do not fail on endpoint errors to allow the rest of cleanup to proceed + fail_on_endpoint_errors=False, ) cleanup_error = None @@ -1167,8 +1193,12 @@ def __init__(self, config: NeonEnvBuilder): if config.auth_enabled: sk_cfg["auth_enabled"] = True if self.safekeepers_remote_storage is not None: - sk_cfg["remote_storage"] = self.safekeepers_remote_storage.to_toml_inline_table() - self.safekeepers.append(Safekeeper(env=self, id=id, port=port)) + sk_cfg[ + "remote_storage" + ] = self.safekeepers_remote_storage.to_toml_inline_table().strip() + self.safekeepers.append( + Safekeeper(env=self, id=id, port=port, extra_opts=config.safekeeper_extra_opts) + ) cfg["safekeepers"].append(sk_cfg) log.info(f"Config: {cfg}") @@ -1212,11 +1242,11 @@ def start(self, timeout_in_seconds: Optional[int] = None): for f in futs: f.result() - def stop(self, immediate=False, ps_assert_metric_no_errors=False): + def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True): """ After this method returns, there should be no child processes running. 
""" - self.endpoints.stop_all() + self.endpoints.stop_all(fail_on_endpoint_errors) # Stop storage controller before pageservers: we don't want it to spuriously # detect a pageserver "failure" during test teardown @@ -1933,6 +1963,7 @@ def endpoint_reconfigure( endpoint_id: str, tenant_id: Optional[TenantId] = None, pageserver_id: Optional[int] = None, + safekeepers: Optional[List[int]] = None, check_return_code=True, ) -> "subprocess.CompletedProcess[str]": args = ["endpoint", "reconfigure", endpoint_id] @@ -1940,6 +1971,8 @@ def endpoint_reconfigure( args.extend(["--tenant-id", str(tenant_id)]) if pageserver_id is not None: args.extend(["--pageserver-id", str(pageserver_id)]) + if safekeepers is not None: + args.extend(["--safekeepers", (",".join(map(str, safekeepers)))]) return self.raw_cli(args, check_return_code=check_return_code) def endpoint_stop( @@ -2108,6 +2141,21 @@ def stop(self, immediate: bool = False) -> "NeonStorageController": self.running = False return self + @staticmethod + def retryable_node_operation(op, ps_id, max_attempts, backoff): + while max_attempts > 0: + try: + op(ps_id) + return + except StorageControllerApiException as e: + max_attempts -= 1 + log.info(f"Operation failed ({max_attempts} attempts left): {e}") + + if max_attempts == 0: + raise e + + time.sleep(backoff) + @staticmethod def raise_api_exception(res: requests.Response): try: @@ -2448,6 +2496,38 @@ def consistency_check(self): ) log.info("storage controller passed consistency check") + def poll_node_status( + self, node_id: int, desired_scheduling_policy: str, max_attempts: int, backoff: int + ): + """ + Poll the node status until it reaches 'desired_scheduling_policy' or 'max_attempts' have been exhausted + """ + log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy") + while max_attempts > 0: + try: + status = self.node_status(node_id) + policy = status["scheduling"] + if policy == desired_scheduling_policy: + return + else: + max_attempts -= 1 + log.info(f"Status call returned {policy=} ({max_attempts} attempts left)") + + if max_attempts == 0: + raise AssertionError( + f"Status for {node_id=} did not reach {desired_scheduling_policy=}" + ) + + time.sleep(backoff) + except StorageControllerApiException as e: + max_attempts -= 1 + log.info(f"Status call failed ({max_attempts} retries left): {e}") + + if max_attempts == 0: + raise e + + time.sleep(backoff) + def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): if isinstance(config_strings, tuple): pairs = [config_strings] @@ -2738,7 +2818,19 @@ def tenant_create( if generation is None: generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) client = self.http_client(auth_token=auth_token) - return client.tenant_create(tenant_id, conf, generation=generation) + + conf = conf or {} + + client.tenant_location_conf( + tenant_id, + { + "mode": "AttachedSingle", + "generation": generation, + "tenant_conf": conf, + "secondary_conf": None, + }, + ) + return tenant_id def list_layers( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId @@ -2796,14 +2888,21 @@ def _build_env(self, env_add: Optional[Env]) -> Env: env.update(env_add) return env - def run( + def _log_env(self, env: dict[str, str]) -> None: + env_s = {} + for k, v in env.items(): + if k.startswith("PG") and k != "PGPASSWORD": + env_s[k] = v + log.debug(f"Environment: {env_s}") + + def run_nonblocking( self, command: List[str], env: Optional[Env] = None, cwd: Optional[Union[str, Path]] = None, 
- ): + ) -> subprocess.Popen[Any]: """ - Run one of the postgres binaries. + Run one of the postgres binaries, not waiting for it to finish The command should be in list form, e.g. ['pgbench', '-p', '55432'] @@ -2814,11 +2913,34 @@ def run( If you want stdout/stderr captured to files, use `run_capture` instead. """ - self._fixpath(command) log.info(f"Running command '{' '.join(command)}'") env = self._build_env(env) - subprocess.run(command, env=env, cwd=cwd, check=True) + self._log_env(env) + return subprocess.Popen(command, env=env, cwd=cwd, stdout=subprocess.PIPE, text=True) + + def run( + self, + command: List[str], + env: Optional[Env] = None, + cwd: Optional[Union[str, Path]] = None, + ) -> None: + """ + Run one of the postgres binaries, waiting for it to finish + + The command should be in list form, e.g. ['pgbench', '-p', '55432'] + + All the necessary environment variables will be set. + + If the first argument (the command name) doesn't include a path (no '/' + characters present), then it will be edited to include the correct path. + + If you want stdout/stderr captured to files, use `run_capture` instead. + """ + proc = self.run_nonblocking(command, env, cwd) + proc.wait() + if proc.returncode != 0: + raise subprocess.CalledProcessError(proc.returncode, proc.args) def run_capture( self, @@ -2838,6 +2960,7 @@ def run_capture( self._fixpath(command) log.info(f"Running command '{' '.join(command)}'") env = self._build_env(env) + self._log_env(env) base_path, _, _ = subprocess_capture( self.log_dir, command, @@ -3476,7 +3599,6 @@ def __init__( ): super().__init__(host="localhost", port=pg_port, user="cloud_admin", dbname="postgres") self.env = env - self.running = False self.branch_name: Optional[str] = None # dubious self.endpoint_id: Optional[str] = None # dubious, see asserts below self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA @@ -3484,6 +3606,7 @@ def __init__( self.pg_port = pg_port self.http_port = http_port self.check_stop_result = check_stop_result + # passed to endpoint create and endpoint reconfigure self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers)) # path to conf is /endpoints//pgdata/postgresql.conf @@ -3552,6 +3675,7 @@ def start( self, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + safekeepers: Optional[List[int]] = None, allow_multiple: bool = False, ) -> "Endpoint": """ @@ -3561,6 +3685,11 @@ def start( assert self.endpoint_id is not None + # If `safekeepers` is not None, they are remember them as active and use + # in the following commands. + if safekeepers is not None: + self.active_safekeepers = safekeepers + log.info(f"Starting postgres endpoint {self.endpoint_id}") self.env.neon_cli.endpoint_start( @@ -3624,9 +3753,17 @@ def edit_hba(self, hba: List[str]): def is_running(self): return self._running._value > 0 - def reconfigure(self, pageserver_id: Optional[int] = None): + def reconfigure( + self, pageserver_id: Optional[int] = None, safekeepers: Optional[List[int]] = None + ): assert self.endpoint_id is not None - self.env.neon_cli.endpoint_reconfigure(self.endpoint_id, self.tenant_id, pageserver_id) + # If `safekeepers` is not None, they are remember them as active and use + # in the following commands. 
+ if safekeepers is not None: + self.active_safekeepers = safekeepers + self.env.neon_cli.endpoint_reconfigure( + self.endpoint_id, self.tenant_id, pageserver_id, self.active_safekeepers + ) def respec(self, **kwargs): """Update the endpoint.json file used by control_plane.""" @@ -3821,13 +3958,23 @@ def create( pageserver_id=pageserver_id, ) - def stop_all(self) -> "EndpointFactory": + def stop_all(self, fail_on_error=True) -> "EndpointFactory": + exception = None for ep in self.endpoints: - ep.stop() + try: + ep.stop() + except Exception as e: + log.error(f"Failed to stop endpoint {ep.endpoint_id}: {e}") + exception = e + + if fail_on_error and exception is not None: + raise exception return self - def new_replica(self, origin: Endpoint, endpoint_id: str, config_lines: Optional[List[str]]): + def new_replica( + self, origin: Endpoint, endpoint_id: str, config_lines: Optional[List[str]] = None + ): branch_name = origin.branch_name assert origin in self.endpoints assert branch_name is not None @@ -3874,16 +4021,28 @@ class Safekeeper(LogUtils): id: int running: bool = False - def __init__(self, env: NeonEnv, port: SafekeeperPort, id: int, running: bool = False): + def __init__( + self, + env: NeonEnv, + port: SafekeeperPort, + id: int, + running: bool = False, + extra_opts: Optional[List[str]] = None, + ): self.env = env self.port = port self.id = id self.running = running self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log" + self.extra_opts = extra_opts def start( self, extra_opts: Optional[List[str]] = None, timeout_in_seconds: Optional[int] = None ) -> "Safekeeper": + if extra_opts is None: + # Apply either the extra_opts passed in, or the ones from our constructor: we do not merge the two. + extra_opts = self.extra_opts + assert self.running is False self.env.neon_cli.safekeeper_start( self.id, extra_opts=extra_opts, timeout_in_seconds=timeout_in_seconds diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 794961271418..03aee9e5c597 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -220,34 +220,6 @@ def tenant_list(self) -> List[Dict[Any, Any]]: assert isinstance(res_json, list) return res_json - def tenant_create( - self, - new_tenant_id: Union[TenantId, TenantShardId], - conf: Optional[Dict[str, Any]] = None, - generation: Optional[int] = None, - ) -> TenantId: - if conf is not None: - assert "new_tenant_id" not in conf.keys() - - body: Dict[str, Any] = { - "new_tenant_id": str(new_tenant_id), - **(conf or {}), - } - - if generation is not None: - body.update({"generation": generation}) - - res = self.post( - f"http://localhost:{self.port}/v1/tenant", - json=body, - ) - self.verbose_error(res) - if res.status_code == 409: - raise Exception(f"could not create tenant: already exists for id {new_tenant_id}") - new_tenant_id = res.json() - assert isinstance(new_tenant_id, str) - return TenantId(new_tenant_id) - def tenant_attach( self, tenant_id: Union[TenantId, TenantShardId], @@ -627,6 +599,22 @@ def timeline_get_lsn_by_timestamp( res_json = res.json() return res_json + def timeline_lsn_lease( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn + ): + data = { + "lsn": str(lsn), + } + + log.info(f"Requesting lsn lease for {lsn=}, {tenant_id=}, {timeline_id=}") + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/lsn_lease", + json=data, + ) + self.verbose_error(res) + res_json = res.json() + return 
res_json + def timeline_get_timestamp_of_lsn( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn ): diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index 8730d8ef751d..c437258c6f87 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -42,10 +42,6 @@ def single_timeline( log.info("detach template tenant form pageserver") env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) log.info(f"duplicating template tenant {ncopies} times in S3") tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 60535b759261..b75a480a637e 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -198,7 +198,7 @@ def wait_for_last_record_lsn( lsn: Lsn, ) -> Lsn: """waits for pageserver to catch up to a certain lsn, returns the last observed lsn.""" - for i in range(100): + for i in range(1000): current_lsn = last_record_lsn(pageserver_http, tenant, timeline) if current_lsn >= lsn: return current_lsn diff --git a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py index 0ff9c8fdaa98..33848b06d35c 100644 --- a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py +++ b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py @@ -55,10 +55,6 @@ def setup_template(env: NeonEnv): } template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) env.pageserver.tenant_attach(template_tenant, config) ep = env.endpoints.create_start("main", tenant_id=template_tenant) ep.safe_psql("create table foo(b text)") diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index b66db4d0ab72..b41ae601975f 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -86,10 +86,6 @@ def setup_tenant_template(env: NeonEnv, n_txns: int): template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) env.pageserver.tenant_attach(template_tenant, config) ps_http = env.pageserver.http_client() diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 1d579214b0c5..60861cf939b8 100644 --- 
a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -1,4 +1,5 @@ import json +import os from pathlib import Path from typing import Any, Dict, Tuple @@ -17,30 +18,74 @@ setup_pageserver_with_tenants, ) +# The following tests use pagebench "getpage at latest LSN" to characterize the throughput of the pageserver. +# originally there was a single test named `test_pageserver_max_throughput_getpage_at_latest_lsn`` +# so you still see some references to this name in the code. +# To avoid recreating the snapshots for each test, we continue to use the name `max_throughput_latest_lsn` +# for some files and metrics. + + +# For reference, the space usage of the snapshots: +# sudo du -hs /instance_store/neon/test_output/shared-snapshots/* +# 416G /instance_store/neon/test_output/shared-snapshots/max_throughput_latest_lsn-500-13 +@pytest.mark.parametrize("duration", [60 * 60]) +@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)]) +@pytest.mark.parametrize("n_tenants", [500]) +@pytest.mark.timeout(10000) +@pytest.mark.skipif( + os.getenv("CI", "false") == "true", + reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI", +) +def test_pageserver_characterize_throughput_with_n_tenants( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + n_tenants: int, + pgbench_scale: int, + duration: int, +): + setup_and_run_pagebench_benchmark( + neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, 1 + ) + # For reference, the space usage of the snapshots: -# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots -# 137G /instance_store/test_output/shared-snapshots -# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots/* -# 1.8G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-13 -# 1.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-6 -# 8.5G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-13 -# 5.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-6 -# 76G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-13 -# 46G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-6 -@pytest.mark.parametrize("duration", [30]) -@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(s) for s in [100, 200]]) -@pytest.mark.parametrize("n_tenants", [1, 10]) -@pytest.mark.timeout( - 10000 -) # TODO: this value is just "a really high number"; have this per instance type -def test_pageserver_max_throughput_getpage_at_latest_lsn( +# sudo du -hs /instance_store/neon/test_output/shared-snapshots/* +# 19G /instance_store/neon/test_output/shared-snapshots/max_throughput_latest_lsn-1-136 +@pytest.mark.parametrize("duration", [20 * 60]) +@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(2048)]) +# we use 1 client to characterize latencies, and 64 clients to characterize throughput/scalability +# we use 64 clients because typically for a high number of connections we recommend the connection pooler +# which by default uses 64 connections +@pytest.mark.parametrize("n_clients", [1, 64]) +@pytest.mark.parametrize("n_tenants", [1]) +@pytest.mark.timeout(2400) +@pytest.mark.skipif( + os.getenv("CI", "false") == "true", + 
reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI", +) +def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, pg_bin: PgBin, n_tenants: int, pgbench_scale: int, duration: int, + n_clients: int, +): + setup_and_run_pagebench_benchmark( + neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, n_clients + ) + + +def setup_and_run_pagebench_benchmark( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + n_tenants: int, + pgbench_scale: int, + duration: int, + n_clients: int, ): def record(metric, **kwargs): zenbenchmark.record( @@ -55,6 +100,7 @@ def record(metric, **kwargs): "n_tenants": (n_tenants, {"unit": ""}), "pgbench_scale": (pgbench_scale, {"unit": ""}), "duration": (duration, {"unit": "s"}), + "n_clients": (n_clients, {"unit": ""}), } ) @@ -96,7 +142,7 @@ def setup_wrapper(env: NeonEnv): r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*" ) - run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration) + run_pagebench_benchmark(env, pg_bin, record, duration, n_clients) def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): @@ -118,10 +164,6 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): } template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) env.pageserver.tenant_attach(template_tenant, config) ps_http = env.pageserver.http_client() with env.endpoints.create_start("main", tenant_id=template_tenant) as ep: @@ -157,8 +199,8 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): return (template_tenant, template_timeline, config) -def run_benchmark_max_throughput_latest_lsn( - env: NeonEnv, pg_bin: PgBin, record, duration_secs: int +def run_pagebench_benchmark( + env: NeonEnv, pg_bin: PgBin, record, duration_secs: int, n_clients: int ): """ Benchmark `env.pageserver` for max throughput @ latest LSN and record results in `zenbenchmark`. 
@@ -172,6 +214,8 @@ def run_benchmark_max_throughput_latest_lsn( ps_http.base_url, "--page-service-connstring", env.pageserver.connstr(password=None), + "--num-clients", + str(n_clients), "--runtime", f"{duration_secs}s", # don't specify the targets explicitly, let pagebench auto-discover them diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py index 92e05663ce20..88296a7fbdec 100644 --- a/test_runner/performance/pageserver/util.py +++ b/test_runner/performance/pageserver/util.py @@ -22,7 +22,7 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): log.info("wait for all tenants to become active") wait_until_all_tenants_state( - ps_http, "Active", iterations=n_tenants, period=1, http_error_ok=False + ps_http, "Active", iterations=10 + n_tenants, period=1, http_error_ok=False ) # ensure all layers are resident for predictiable performance diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 7d11facc2949..5ab83dd31d0b 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -1,8 +1,24 @@ +from __future__ import annotations + import time +import traceback +from typing import TYPE_CHECKING +import psycopg2 +import psycopg2.extras import pytest +from fixtures.benchmark_fixture import MetricReport +from fixtures.common_types import Lsn from fixtures.log_helper import log -from fixtures.neon_fixtures import AuxFileStore, NeonEnv, PgBin, logical_replication_sync +from fixtures.neon_api import connection_parameters_to_env +from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync +from fixtures.pg_version import PgVersion + +if TYPE_CHECKING: + from fixtures.benchmark_fixture import NeonBenchmarker + from fixtures.neon_api import NeonAPI + from fixtures.neon_fixtures import NeonEnv, PgBin + from fixtures.pg_version import PgVersion @pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2]) @@ -26,7 +42,6 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg vanilla_pg.safe_psql("truncate table pgbench_history") connstr = endpoint.connstr().replace("'", "''") - print(f"connstr='{connstr}'") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") # Wait logical replication channel to be established @@ -42,3 +57,286 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg sum_master = endpoint.safe_psql("select sum(abalance) from pgbench_accounts")[0][0] sum_replica = vanilla_pg.safe_psql("select sum(abalance) from pgbench_accounts")[0][0] assert sum_master == sum_replica + + +def check_pgbench_still_running(pgbench, label=""): + rc = pgbench.poll() + if rc is not None: + raise RuntimeError(f"{label} pgbench terminated early with return code {rc}") + + +def measure_logical_replication_lag(sub_cur, pub_cur, timeout_sec=600): + start = time.time() + pub_cur.execute("SELECT pg_current_wal_flush_lsn()") + pub_lsn = Lsn(pub_cur.fetchall()[0][0]) + while (time.time() - start) < timeout_sec: + sub_cur.execute("SELECT latest_end_lsn FROM pg_catalog.pg_stat_subscription") + res = sub_cur.fetchall()[0][0] + if res: + log.info(f"subscriber_lsn={res}") + sub_lsn = Lsn(res) + log.info(f"Subscriber LSN={sub_lsn}, publisher LSN={pub_lsn}") + if sub_lsn >= pub_lsn: + return time.time() - start + time.sleep(0.5) + raise TimeoutError(f"Logical replication sync took more than {timeout_sec} 
sec") + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_subscriber_lag( + pg_bin: PgBin, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + """ + Creates a publisher and subscriber, runs pgbench inserts on publisher and pgbench selects + on subscriber. Periodically restarts subscriber while still running the inserts, and + measures how long sync takes after restart. + """ + test_duration_min = 60 + sync_interval_min = 5 + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + pub_project = neon_api.create_project(pg_version) + pub_project_id = pub_project["project"]["id"] + neon_api.wait_for_operation_to_finish(pub_project_id) + error_occurred = False + try: + sub_project = neon_api.create_project(pg_version) + sub_project_id = sub_project["project"]["id"] + sub_endpoint_id = sub_project["endpoints"][0]["id"] + neon_api.wait_for_operation_to_finish(sub_project_id) + try: + pub_env = connection_parameters_to_env( + pub_project["connection_uris"][0]["connection_parameters"] + ) + sub_env = connection_parameters_to_env( + sub_project["connection_uris"][0]["connection_parameters"] + ) + pub_connstr = pub_project["connection_uris"][0]["connection_uri"] + sub_connstr = sub_project["connection_uris"][0]["connection_uri"] + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + sub_cur.execute("truncate table pgbench_accounts") + sub_cur.execute("truncate table pgbench_history") + + pub_cur.execute( + "create publication pub1 for table pgbench_accounts, pgbench_history" + ) + sub_cur.execute( + f"create subscription sub1 connection '{pub_connstr}' publication pub1" + ) + + initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() + sub_conn.close() + + zenbenchmark.record( + "initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER + ) + + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env + ) + try: + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) + try: + start = time.time() + while time.time() - start < test_duration_min * 60: + time.sleep(sync_interval_min * 60) + check_pgbench_still_running(pub_workload, "pub") + check_pgbench_still_running(sub_workload, "sub") + + with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( + sub_connstr + ) as sub_conn: + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) + + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + sub_workload.terminate() + neon_api.restart_endpoint( + sub_project_id, + sub_endpoint_id, + ) + neon_api.wait_for_operation_to_finish(sub_project_id) + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) + + # Measure storage to make sure replication information isn't bloating storage + sub_storage = neon_api.get_project_details(sub_project_id)["project"][ + "synthetic_storage_size" + ] + pub_storage = neon_api.get_project_details(pub_project_id)["project"][ + "synthetic_storage_size" + ] + zenbenchmark.record( + 
"sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER + ) + zenbenchmark.record( + "pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER + ) + + finally: + sub_workload.terminate() + finally: + pub_workload.terminate() + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + if not error_occurred: + neon_api.delete_project(sub_project_id) + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred + neon_api.delete_project(pub_project_id) + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_publisher_restart( + pg_bin: PgBin, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + """ + Creates a publisher and subscriber, runs pgbench inserts on publisher and pgbench selects + on subscriber. Periodically restarts publisher (to exercise on-demand WAL download), and + measures how long sync takes after restart. + """ + test_duration_min = 60 + sync_interval_min = 5 + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + pub_project = neon_api.create_project(pg_version) + pub_project_id = pub_project["project"]["id"] + pub_endpoint_id = pub_project["endpoints"][0]["id"] + neon_api.wait_for_operation_to_finish(pub_project_id) + error_occurred = False + try: + sub_project = neon_api.create_project(pg_version) + sub_project_id = sub_project["project"]["id"] + neon_api.wait_for_operation_to_finish(sub_project_id) + try: + pub_env = connection_parameters_to_env( + pub_project["connection_uris"][0]["connection_parameters"] + ) + sub_env = connection_parameters_to_env( + sub_project["connection_uris"][0]["connection_parameters"] + ) + pub_connstr = pub_project["connection_uris"][0]["connection_uri"] + sub_connstr = sub_project["connection_uris"][0]["connection_uri"] + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + sub_cur.execute("truncate table pgbench_accounts") + sub_cur.execute("truncate table pgbench_history") + + pub_cur.execute( + "create publication pub1 for table pgbench_accounts, pgbench_history" + ) + sub_cur.execute( + f"create subscription sub1 connection '{pub_connstr}' publication pub1" + ) + + initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() + sub_conn.close() + + zenbenchmark.record( + "initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER + ) + + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env + ) + try: + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) + try: + start = time.time() + while time.time() - start < test_duration_min * 60: + time.sleep(sync_interval_min * 60) + check_pgbench_still_running(pub_workload, "pub") + check_pgbench_still_running(sub_workload, "sub") + + pub_workload.terminate() + neon_api.restart_endpoint( + pub_project_id, + pub_endpoint_id, + ) + neon_api.wait_for_operation_to_finish(pub_project_id) + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], + env=pub_env, + ) + with psycopg2.connect(pub_connstr) as 
pub_conn, psycopg2.connect( + sub_connstr + ) as sub_conn: + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) + + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + + # Measure storage to make sure replication information isn't bloating storage + sub_storage = neon_api.get_project_details(sub_project_id)["project"][ + "synthetic_storage_size" + ] + pub_storage = neon_api.get_project_details(pub_project_id)["project"][ + "synthetic_storage_size" + ] + zenbenchmark.record( + "sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER + ) + zenbenchmark.record( + "pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER + ) + + finally: + sub_workload.terminate() + finally: + pub_workload.terminate() + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + if not error_occurred: + neon_api.delete_project(sub_project_id) + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred + neon_api.delete_project(pub_project_id) diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py new file mode 100644 index 000000000000..7e1619721144 --- /dev/null +++ b/test_runner/performance/test_physical_replication.py @@ -0,0 +1,296 @@ +from __future__ import annotations + +import csv +import os +import subprocess +import time +import traceback +from pathlib import Path +from typing import TYPE_CHECKING + +import psycopg2 +import psycopg2.extras +import pytest +from fixtures.benchmark_fixture import MetricReport +from fixtures.common_types import Lsn +from fixtures.log_helper import log +from fixtures.neon_api import connection_parameters_to_env +from fixtures.pg_version import PgVersion + +if TYPE_CHECKING: + from typing import Any, List, Optional + + from fixtures.benchmark_fixture import NeonBenchmarker + from fixtures.neon_api import NeonAPI + from fixtures.neon_fixtures import PgBin + + +# Granularity of ~0.5 sec +def measure_replication_lag(master, replica, timeout_sec=600): + start = time.time() + master.execute("SELECT pg_current_wal_flush_lsn()") + master_lsn = Lsn(master.fetchall()[0][0]) + while (time.time() - start) < timeout_sec: + replica.execute("select pg_last_wal_replay_lsn()") + replica_lsn = replica.fetchall()[0][0] + if replica_lsn: + if Lsn(replica_lsn) >= master_lsn: + return time.time() - start + time.sleep(0.5) + raise TimeoutError(f"Replication sync took more than {timeout_sec} sec") + + +def check_pgbench_still_running(pgbench): + rc = pgbench.poll() + if rc is not None: + raise RuntimeError(f"Pgbench terminated early with return code {rc}") + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_ro_replica_lag( + pg_bin: PgBin, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + test_duration_min = 60 + sync_interval_min = 10 + + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + project = neon_api.create_project(pg_version) + project_id = project["project"]["id"] + neon_api.wait_for_operation_to_finish(project_id) + error_occurred = False + try: + branch_id = project["branch"]["id"] + master_connstr = project["connection_uris"][0]["connection_uri"] + master_env = connection_parameters_to_env( + 
project["connection_uris"][0]["connection_parameters"] + ) + + replica = neon_api.create_endpoint( + project_id, + branch_id, + endpoint_type="read_only", + settings={"pg_settings": {"hot_standby_feedback": "on"}}, + ) + replica_env = master_env.copy() + replica_env["PGHOST"] = replica["endpoint"]["host"] + neon_api.wait_for_operation_to_finish(project_id) + + replica_connstr = neon_api.get_connection_uri( + project_id, + endpoint_id=replica["endpoint"]["id"], + )["uri"] + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=master_env) + + master_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], + env=master_env, + ) + try: + replica_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=replica_env, + ) + try: + start = time.time() + while time.time() - start < test_duration_min * 60: + check_pgbench_still_running(master_workload) + check_pgbench_still_running(replica_workload) + time.sleep(sync_interval_min * 60) + with psycopg2.connect(master_connstr) as conn_master, psycopg2.connect( + replica_connstr + ) as conn_replica: + with conn_master.cursor() as cur_master, conn_replica.cursor() as cur_replica: + lag = measure_replication_lag(cur_master, cur_replica) + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + finally: + replica_workload.terminate() + finally: + master_workload.terminate() + except Exception as e: + error_occurred = True + log.error(f"Caught exception: {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred # Fail the test if an error occurred + neon_api.delete_project(project_id) + + +def report_pgbench_aggregate_intervals( + output_dir: Path, + prefix: str, + zenbenchmark: NeonBenchmarker, +): + for filename in os.listdir(output_dir): + if filename.startswith(prefix): + # The file will be in the form _. + # So we first lop off the ., and then lop off the prefix and the _ + node = filename.split(".")[0][len(prefix) + 1 :] + with open(output_dir / filename) as f: + reader = csv.reader(f, delimiter=" ") + for line in reader: + num_transactions = int(line[1]) + if num_transactions == 0: + continue + sum_latency = int(line[2]) + sum_lag = int(line[3]) + zenbenchmark.record( + f"{node}_num_txns", num_transactions, "txns", MetricReport.HIGHER_IS_BETTER + ) + zenbenchmark.record( + f"{node}_avg_latency", + sum_latency / num_transactions, + "s", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + f"{node}_avg_lag", + sum_lag / num_transactions, + "s", + MetricReport.LOWER_IS_BETTER, + ) + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_replication_start_stop( + pg_bin: PgBin, + test_output_dir: Path, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + """ + Cycles through different configurations of read replicas being enabled disabled. The whole time, + there's a pgbench read/write workload going on the master. For each replica, we either turn it + on or off, and see how long it takes to catch up after some set amount of time of replicating + the pgbench. 
+ """ + + prefix = "pgbench_agg" + num_replicas = 2 + configuration_test_time_sec = 10 * 60 + pgbench_duration = f"-T{2 ** num_replicas * configuration_test_time_sec}" + error_occurred = False + + project = neon_api.create_project(pg_version) + project_id = project["project"]["id"] + neon_api.wait_for_operation_to_finish(project_id) + try: + branch_id = project["branch"]["id"] + master_connstr = project["connection_uris"][0]["connection_uri"] + master_env = connection_parameters_to_env( + project["connection_uris"][0]["connection_parameters"] + ) + + replicas = [] + for _ in range(num_replicas): + replicas.append( + neon_api.create_endpoint( + project_id, + branch_id, + endpoint_type="read_only", + settings={"pg_settings": {"hot_standby_feedback": "on"}}, + ) + ) + neon_api.wait_for_operation_to_finish(project_id) + + replica_connstr = [ + neon_api.get_connection_uri( + project_id, + endpoint_id=replicas[i]["endpoint"]["id"], + )["uri"] + for i in range(num_replicas) + ] + replica_env = [master_env.copy() for _ in range(num_replicas)] + for i in range(num_replicas): + replica_env[i]["PGHOST"] = replicas[i]["endpoint"]["host"] + + pg_bin.run_capture(["pgbench", "-i", "-s10"], env=master_env) + + # Sync replicas + with psycopg2.connect(master_connstr) as conn_master: + with conn_master.cursor() as cur_master: + for i in range(num_replicas): + conn_replica = psycopg2.connect(replica_connstr[i]) + measure_replication_lag(cur_master, conn_replica.cursor()) + + master_pgbench = pg_bin.run_nonblocking( + [ + "pgbench", + "-c10", + pgbench_duration, + "-Mprepared", + "--log", + f"--log-prefix={test_output_dir}/{prefix}_master", + f"--aggregate-interval={configuration_test_time_sec}", + ], + env=master_env, + ) + replica_pgbench: List[Optional[subprocess.Popen[Any]]] = [None for _ in range(num_replicas)] + + # Use the bits of iconfig to tell us which configuration we are on. For example + # a iconfig of 2 is 10 in binary, indicating replica 0 is suspended and replica 1 is + # alive. 
+ for iconfig in range((1 << num_replicas) - 1, -1, -1): + + def replica_enabled(iconfig: int = iconfig): + return bool((iconfig >> 1) & 1) + + # Change configuration + for ireplica in range(num_replicas): + if replica_enabled() and replica_pgbench[ireplica] is None: + replica_pgbench[ireplica] = pg_bin.run_nonblocking( + [ + "pgbench", + "-c10", + "-S", + pgbench_duration, + "--log", + f"--log-prefix={test_output_dir}/{prefix}_replica_{ireplica}", + f"--aggregate-interval={configuration_test_time_sec}", + ], + env=replica_env[ireplica], + ) + elif not replica_enabled() and replica_pgbench[ireplica] is not None: + pgb = replica_pgbench[ireplica] + assert pgb is not None + pgb.terminate() + pgb.wait() + replica_pgbench[ireplica] = None + + neon_api.suspend_endpoint( + project_id, + replicas[ireplica]["endpoint"]["id"], + ) + neon_api.wait_for_operation_to_finish(project_id) + + time.sleep(configuration_test_time_sec) + + with psycopg2.connect(master_connstr) as conn_master: + with conn_master.cursor() as cur_master: + for ireplica in range(num_replicas): + replica_conn = psycopg2.connect(replica_connstr[ireplica]) + lag = measure_replication_lag(cur_master, replica_conn.cursor()) + zenbenchmark.record( + f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER + ) + log.info( + f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:>b}" + ) + master_pgbench.terminate() + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred + neon_api.delete_project(project_id) + # Only report results if we didn't error out + report_pgbench_aggregate_intervals(test_output_dir, prefix, zenbenchmark) diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index a4c8c8ac421a..3a6113706fa9 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -1,18 +1,89 @@ import concurrent.futures import random import time +from collections import defaultdict +from typing import Any, Dict import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - NeonEnvBuilder, -) +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder from fixtures.pageserver.http import PageserverHttpClient from fixtures.pg_version import PgVersion +def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[str, int]: + """ + Get the number of shards attached to each node. + This function takes into account the intersection of the intent and the observed state. + If they do not match, it asserts out. 
+ """ + tenants = env.storage_controller.tenant_list() + + intent = dict() + observed = dict() + + tenant_placement: defaultdict[str, Dict[str, Any]] = defaultdict( + lambda: { + "observed": {"attached": None, "secondary": []}, + "intent": {"attached": None, "secondary": []}, + } + ) + + for t in tenants: + for node_id, loc_state in t["observed"]["locations"].items(): + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] + in set(["AttachedSingle", "AttachedMulti", "AttachedStale"]) + ): + observed[t["tenant_shard_id"]] = int(node_id) + tenant_placement[t["tenant_shard_id"]]["observed"]["attached"] = int(node_id) + + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] == "Secondary" + ): + tenant_placement[t["tenant_shard_id"]]["observed"]["secondary"].append(int(node_id)) + + if "attached" in t["intent"]: + intent[t["tenant_shard_id"]] = t["intent"]["attached"] + tenant_placement[t["tenant_shard_id"]]["intent"]["attached"] = t["intent"]["attached"] + + if "secondary" in t["intent"]: + tenant_placement[t["tenant_shard_id"]]["intent"]["secondary"] += t["intent"][ + "secondary" + ] + + log.info(f"{tenant_placement=}") + + matching = { + tid: intent[tid] for tid in observed if tid in intent and intent[tid] == observed[tid] + } + assert len(matching) == total_shards + + attached_per_node: defaultdict[str, int] = defaultdict(int) + for node_id in matching.values(): + attached_per_node[node_id] += 1 + + return attached_per_node + + +def assert_consistent_balanced_attachments(env: NeonEnv, total_shards): + attached_per_node = get_consistent_node_shard_counts(env, total_shards) + + min_shard_count = min(attached_per_node.values()) + max_shard_count = max(attached_per_node.values()) + + flake_factor = 5 / 100 + assert max_shard_count - min_shard_count <= int(total_shards * flake_factor) + + @pytest.mark.timeout(3600) # super long running test: should go down as we optimize def test_storage_controller_many_tenants( neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure @@ -44,7 +115,8 @@ def test_storage_controller_many_tenants( # A small sleep on each call into the notify hook, to simulate the latency of doing a database write compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01)) - env = neon_env_builder.init_start() + env = neon_env_builder.init_configs() + neon_env_builder.start() # We will intentionally stress reconciler concurrrency, which triggers a warning when lots # of shards are hitting the delayed path. @@ -60,14 +132,6 @@ def test_storage_controller_many_tenants( ) for ps in env.pageservers: - # This can happen because when we do a loop over all pageservers and mark them offline/active, - # reconcilers might get cancelled, and the next reconcile can follow a not-so-elegant path of - # bumping generation before other attachments are detached. - # - # We could clean this up by making reconcilers respect the .observed of their predecessor, if - # we spawn with a wait for the predecessor. - ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - # Storage controller is allowed to drop pageserver requests when the cancellation token # for a Reconciler fires. 
ps.allowed_errors.append(".*request was dropped before completing.*") @@ -79,6 +143,8 @@ def test_storage_controller_many_tenants( shard_count = 2 stripe_size = 1024 + total_shards = tenant_count * shard_count + tenants = set(TenantId.generate() for _i in range(0, tenant_count)) virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) @@ -195,10 +261,44 @@ def check_memory(): env.storage_controller.consistency_check() check_memory() - # Restart pageservers: this exercises the /re-attach API - for pageserver in env.pageservers: - pageserver.stop() - pageserver.start() + shard_counts = get_consistent_node_shard_counts(env, total_shards) + log.info(f"Shard counts before rolling restart: {shard_counts}") + + assert_consistent_balanced_attachments(env, total_shards) + + # Restart pageservers gracefully: this exercises the /re-attach pageserver API + # and the storage controller drain and fill API + for ps in env.pageservers: + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 + ) + + env.storage_controller.poll_node_status( + ps.id, "PauseForRestart", max_attempts=24, backoff=5 + ) + + shard_counts = get_consistent_node_shard_counts(env, total_shards) + log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") + # Assert that we've drained the node + assert shard_counts[str(ps.id)] == 0 + # Assert that those shards actually went somewhere + assert sum(shard_counts.values()) == total_shards + + ps.restart() + env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=24, backoff=1) + + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 + ) + env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=24, backoff=5) + + shard_counts = get_consistent_node_shard_counts(env, total_shards) + log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") + + assert_consistent_balanced_attachments(env, total_shards) + + env.storage_controller.reconcile_until_idle() + env.storage_controller.consistency_check() # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, # as they were not offline long enough to trigger any scheduling changes. diff --git a/test_runner/pg_clients/java/jdbc/Dockerfile b/test_runner/pg_clients/java/jdbc/Dockerfile index 7e074e07b836..7c2b1b40e091 100644 --- a/test_runner/pg_clients/java/jdbc/Dockerfile +++ b/test_runner/pg_clients/java/jdbc/Dockerfile @@ -1,4 +1,4 @@ -FROM openjdk:21 +FROM openjdk:22 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/python/pg8000/requirements.txt b/test_runner/pg_clients/python/pg8000/requirements.txt index e086a937e68b..099a4ade2c4d 100644 --- a/test_runner/pg_clients/python/pg8000/requirements.txt +++ b/test_runner/pg_clients/python/pg8000/requirements.txt @@ -1,2 +1,2 @@ -pg8000==1.30.5 +pg8000==1.31.2 scramp>=1.4.3 diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index a4a2426b97ec..32c1c52eea44 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "addr2line" -version = "0.21.0" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678" dependencies = [ "gimli", ] @@ -19,9 +19,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "async-trait" -version = "0.1.77" +version = "0.1.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" +checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" dependencies = [ "proc-macro2", "quote", @@ -30,15 +30,15 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.1.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" [[package]] name = "backtrace" -version = "0.3.69" +version = "0.3.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a" dependencies = [ "addr2line", "cc", @@ -63,9 +63,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.2" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" [[package]] name = "block-buffer" @@ -78,9 +78,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.15.3" +version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea184aa71bb362a1157c896979544cc23974e08fd265f29ea96b59f0b4a555b" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "byteorder" @@ -90,15 +90,15 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.5.0" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" +checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" [[package]] name = "cc" -version = "1.0.89" +version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0ba8f7aaa012f30d5b2861462f6708eccd49c3c39863fe083a308035f63d723" +checksum = "ac367972e516d45567c7eafc73d24e1c193dcf200a8d94e9db7b3d38b349572d" [[package]] name = "cfg-if" @@ 
-154,9 +154,9 @@ dependencies = [ [[package]] name = "errno" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" dependencies = [ "libc", "windows-sys 0.52.0", @@ -170,15 +170,9 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" [[package]] name = "fastrand" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" - -[[package]] -name = "finl_unicode" -version = "1.2.0" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" [[package]] name = "foreign-types" @@ -296,9 +290,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.12" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "libc", @@ -307,9 +301,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.28.1" +version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" +checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" [[package]] name = "hmac" @@ -329,29 +323,23 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - [[package]] name = "libc" -version = "0.2.153" +version = "0.2.155" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" [[package]] name = "linux-raw-sys" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "lock_api" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" dependencies = [ "autocfg", "scopeguard", @@ -375,15 +363,15 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.1" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "miniz_oxide" -version = "0.7.2" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" +checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" dependencies = [ "adler", ] @@ 
-401,11 +389,10 @@ dependencies = [ [[package]] name = "native-tls" -version = "0.2.11" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" +checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" dependencies = [ - "lazy_static", "libc", "log", "openssl", @@ -419,9 +406,9 @@ dependencies = [ [[package]] name = "object" -version = "0.32.2" +version = "0.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +checksum = "576dfe1fc8f9df304abb159d767a29d0476f7750fbf8aa7ad07816004a207434" dependencies = [ "memchr", ] @@ -438,7 +425,7 @@ version = "0.10.64" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f" dependencies = [ - "bitflags 2.4.2", + "bitflags 2.6.0", "cfg-if", "foreign-types", "libc", @@ -466,9 +453,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.101" +version = "0.9.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dda2b0f344e78efc2facf7d195d098df0dd72151b26ab98da807afc26c198dff" +checksum = "c597637d56fbc83893a35eb0dd04b2b8e7a50c91e64e9493e398b5df4fb45fa2" dependencies = [ "cc", "libc", @@ -478,9 +465,9 @@ dependencies = [ [[package]] name = "parking_lot" -version = "0.12.1" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" dependencies = [ "lock_api", "parking_lot_core", @@ -488,15 +475,15 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.9" +version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.5.2", "smallvec", - "windows-targets 0.48.5", + "windows-targets 0.52.5", ] [[package]] @@ -525,9 +512,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.13" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" +checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" [[package]] name = "pin-utils" @@ -591,18 +578,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.78" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.35" +version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] @@ -646,6 +633,15 @@ dependencies = [ "bitflags 1.3.2", ] +[[package]] +name = "redox_syscall" 
+version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c82cf8cff14456045f55ec4241383baeff27af886adb72ffb2162f99911de0fd" +dependencies = [ + "bitflags 2.6.0", +] + [[package]] name = "rust-neon-example" version = "0.1.0" @@ -658,17 +654,17 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustix" -version = "0.38.31" +version = "0.38.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" dependencies = [ - "bitflags 2.4.2", + "bitflags 2.6.0", "errno", "libc", "linux-raw-sys", @@ -692,11 +688,11 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "security-framework" -version = "2.9.2" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de" +checksum = "c627723fd09706bacdb5cf41499e95098555af3c3c29d014dc3c458ef6be11c0" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.6.0", "core-foundation", "core-foundation-sys", "libc", @@ -705,9 +701,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.9.1" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a" +checksum = "317936bbbd05227752583946b9e66d7ce3b489f84e11a94a510b4437fef407d7" dependencies = [ "core-foundation-sys", "libc", @@ -741,15 +737,15 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.13.1" +version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "socket2" -version = "0.5.6" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" dependencies = [ "libc", "windows-sys 0.52.0", @@ -757,26 +753,26 @@ dependencies = [ [[package]] name = "stringprep" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb41d74e231a107a1b4ee36bd1214b11285b77768d2e3824aedafa988fd36ee6" +checksum = "7b4df3d392d81bd458a8a621b8bffbd2302a12ffe288a9d931670948749463b1" dependencies = [ - "finl_unicode", "unicode-bidi", "unicode-normalization", + "unicode-properties", ] [[package]] name = "subtle" -version = "2.5.0" +version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.52" +version = "2.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" +checksum = 
"901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9" dependencies = [ "proc-macro2", "quote", @@ -797,9 +793,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.6.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +checksum = "c55115c6fbe2d2bef26eb09ad74bde02d8255476fc0c7b515ef09fbb35742d82" dependencies = [ "tinyvec_macros", ] @@ -812,9 +808,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.36.0" +version = "1.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" +checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" dependencies = [ "backtrace", "bytes", @@ -828,9 +824,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" +checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" dependencies = [ "proc-macro2", "quote", @@ -875,35 +871,15 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.10" +version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" +checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" dependencies = [ "bytes", "futures-core", "futures-sink", "pin-project-lite", "tokio", - "tracing", -] - -[[package]] -name = "tracing" -version = "0.1.40" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" -dependencies = [ - "pin-project-lite", - "tracing-core", -] - -[[package]] -name = "tracing-core" -version = "0.1.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" -dependencies = [ - "once_cell", ] [[package]] @@ -933,6 +909,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-properties" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4259d9d4425d9f0661581b804cb85fe66a4c631cadd8f490d1c13a35d5d9291" + [[package]] name = "vcpkg" version = "0.2.15" @@ -1023,11 +1005,11 @@ dependencies = [ [[package]] name = "whoami" -version = "1.5.0" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fec781d48b41f8163426ed18e8fc2864c12937df9ce54c88ede7bd47270893e" +checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9" dependencies = [ - "redox_syscall", + "redox_syscall 0.4.1", "wasite", "web-sys", ] @@ -1047,7 +1029,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.4", + "windows-targets 0.52.5", ] [[package]] @@ -1067,17 +1049,18 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" 
dependencies = [ - "windows_aarch64_gnullvm 0.52.4", - "windows_aarch64_msvc 0.52.4", - "windows_i686_gnu 0.52.4", - "windows_i686_msvc 0.52.4", - "windows_x86_64_gnu 0.52.4", - "windows_x86_64_gnullvm 0.52.4", - "windows_x86_64_msvc 0.52.4", + "windows_aarch64_gnullvm 0.52.5", + "windows_aarch64_msvc 0.52.5", + "windows_i686_gnu 0.52.5", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.5", + "windows_x86_64_gnu 0.52.5", + "windows_x86_64_gnullvm 0.52.5", + "windows_x86_64_msvc 0.52.5", ] [[package]] @@ -1088,9 +1071,9 @@ checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" +checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" [[package]] name = "windows_aarch64_msvc" @@ -1100,9 +1083,9 @@ checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" +checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" [[package]] name = "windows_i686_gnu" @@ -1112,9 +1095,15 @@ checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" -version = "0.52.4" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" +checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" [[package]] name = "windows_i686_msvc" @@ -1124,9 +1113,9 @@ checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" +checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" [[package]] name = "windows_x86_64_gnu" @@ -1136,9 +1125,9 @@ checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" +checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" [[package]] name = "windows_x86_64_gnullvm" @@ -1148,9 +1137,9 @@ checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" +checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" [[package]] name = "windows_x86_64_msvc" @@ -1160,6 +1149,6 @@ checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" -version = "0.52.4" 
+version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" +checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml index 0f420e5b0643..27d01810bd52 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml @@ -7,9 +7,9 @@ publish = false # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -native-tls = "0.2.11" +native-tls = "0.2.12" postgres-native-tls = "0.5.0" -tokio = { version = "1.36", features=["rt", "macros"] } +tokio = { version = "1.38", features=["rt", "macros"] } tokio-postgres = "0.7.10" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile index 8611e66cbb67..3e214de785b3 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile +++ b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile @@ -1,4 +1,4 @@ -FROM rust:1.76 +FROM rust:1.79 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile index 04028388207c..6006e61ee22e 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile @@ -1,11 +1,11 @@ -FROM swift:5.9 AS build +FROM swift:5.10 AS build RUN apt-get -q update && apt-get -q install -y libssl-dev WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.9 +FROM swift:5.10 WORKDIR /app COPY --from=build /source/.build/release . 
CMD ["/app/PostgresClientKitExample"] diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved index 767443a9ddcc..6e8613095f15 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved @@ -1,4 +1,5 @@ { + "originHash" : "8eff8c577ba246ce7824d3434839acefced2b1a1d2b1ad700554502538a50558", "pins" : [ { "identity" : "bluesocket", @@ -18,15 +19,6 @@ "version" : "2.0.2" } }, - { - "identity" : "openssl", - "kind" : "remoteSourceControl", - "location" : "https://github.com/Kitura/OpenSSL.git", - "state" : { - "revision" : "5dc8cb4f971135c17343e3c6df4f28904a0600e2", - "version" : "2.3.1" - } - }, { "identity" : "postgresclientkit", "kind" : "remoteSourceControl", @@ -37,5 +29,5 @@ } } ], - "version" : 2 + "version" : 3 } diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift index 48320dd02314..a66d09c542f9 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift @@ -1,4 +1,4 @@ -// swift-tools-version:5.8 +// swift-tools-version:5.10 import PackageDescription let package = Package( diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile index 9130e0973f8e..d6815fbb5fa2 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile @@ -1,10 +1,10 @@ -FROM swift:5.9 AS build +FROM swift:5.10 AS build WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.9 +FROM swift:5.10 WORKDIR /app COPY --from=build /source/.build/release . 
CMD ["/app/PostgresNIOExample"] diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved index 023e03a7b1a0..0e5dfdafcb0a 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved @@ -1,12 +1,22 @@ { + "originHash" : "11b5dcece349a3e56a7a9a7d0af6d0f5b83dff321b43124a01b158ed7aac5302", "pins" : [ { "identity" : "postgres-nio", "kind" : "remoteSourceControl", "location" : "https://github.com/vapor/postgres-nio.git", "state" : { - "revision" : "69ccfdf4c80144d845e3b439961b7ec6cd7ae33f", - "version" : "1.20.2" + "revision" : "5c268768890b062803a49f1358becc478f954265", + "version" : "1.21.5" + } + }, + { + "identity" : "swift-async-algorithms", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-async-algorithms.git", + "state" : { + "revision" : "da4e36f86544cdf733a40d59b3a2267e3a7bbf36", + "version" : "1.0.0" } }, { @@ -81,6 +91,15 @@ "version" : "1.20.1" } }, + { + "identity" : "swift-service-lifecycle", + "kind" : "remoteSourceControl", + "location" : "https://github.com/swift-server/swift-service-lifecycle.git", + "state" : { + "revision" : "d58e6bf2b1ae2884cf204a8b5bcaaa7aae3c1ff0", + "version" : "2.6.0" + } + }, { "identity" : "swift-system", "kind" : "remoteSourceControl", @@ -91,5 +110,5 @@ } } ], - "version" : 2 + "version" : 3 } diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift index 637eb4bc9ddb..20bb10f76c37 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift @@ -1,10 +1,10 @@ -// swift-tools-version:5.9 +// swift-tools-version:5.10 import PackageDescription let package = Package( name: "PostgresNIOExample", dependencies: [ - .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.20.2") + .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.21.5") ], targets: [ .executableTarget( diff --git a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile index 004b383749f9..45e8753f7eec 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile +++ b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile @@ -1,4 +1,4 @@ -FROM node:21 +FROM node:22 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json index b4f8587eacef..19311808b6b8 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json @@ -5,7 +5,7 @@ "packages": { "": { "dependencies": { - "postgresql-client": "2.10.5" + "postgresql-client": "2.11.0" } }, "node_modules/doublylinked": { @@ -42,9 +42,10 @@ } }, "node_modules/postgresql-client": { - "version": "2.10.5", - "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.10.5.tgz", - "integrity": "sha512-R3EC16pUdbgrzk1J2MQLj7jY2TepWurJHoK90nOeLZj1XTpL/+wL1VCneTmclRVKDuKVjFHr+FASV47KrLpAbw==", + "version": "2.11.0", + "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.11.0.tgz", + "integrity": "sha512-QSPHcWVaiBG+JyASaDojOXvhRmsc2n8j2COdIjUDENFAtFls16Zy240asY2ENzZRQJUMAA8vpR8w4SAdI8jdbw==", + "license": "MIT", "dependencies": { "doublylinked": "^2.5.4", "lightning-pool": "^4.2.2", @@ -55,8 +56,7 @@ "putil-varhelpers": "^1.6.5" }, "engines": { - "node": ">=16.0", - "npm": ">=7.0.0" + "node": ">=16.0" } }, "node_modules/power-tasks": { diff --git a/test_runner/pg_clients/typescript/postgresql-client/package.json b/test_runner/pg_clients/typescript/postgresql-client/package.json index 07ec100d0d22..d2bba23d2912 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package.json @@ -1,6 +1,6 @@ { "type": "module", "dependencies": { - "postgresql-client": "2.10.5" + "postgresql-client": "2.11.0" } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile index 004b383749f9..45e8753f7eec 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile +++ b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile @@ -1,4 +1,4 @@ -FROM node:21 +FROM node:22 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json index f3b456f1edc7..7f3f7f2e84e7 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json @@ -5,96 +5,138 @@ "packages": { "": { "dependencies": { - "@neondatabase/serverless": "0.9.0", + "@neondatabase/serverless": "0.9.4", "ws": "8.17.1" } }, "node_modules/@neondatabase/serverless": { - "version": "0.9.0", - "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.9.0.tgz", - "integrity": "sha512-mmJnUAzlzvxNSZuuhI6kgJjH+JgFdBMYUWxihtq/nj0Tjt+Y5UU3W+SvRFoucnd5NObYkuLYQzk+zV5DGFKGJg==", + "version": "0.9.4", + "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.9.4.tgz", + "integrity": "sha512-D0AXgJh6xkf+XTlsO7iwE2Q1w8981E1cLCPAALMU2YKtkF/1SF6BiAzYARZFYo175ON+b1RNIy9TdSFHm5nteg==", + "license": "MIT", "dependencies": { - "@types/pg": "8.6.6" + "@types/pg": "8.11.6" } }, "node_modules/@types/node": { - "version": "18.16.3", - "resolved": "https://registry.npmjs.org/@types/node/-/node-18.16.3.tgz", - "integrity": "sha512-OPs5WnnT1xkCBiuQrZA4+YAV4HEJejmHneyraIaxsbev5yCEr6KMwINNFP9wQeFIw8FWcoTqF3vQsa5CDaI+8Q==" + "version": "20.14.9", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.9.tgz", + "integrity": "sha512-06OCtnTXtWOZBJlRApleWndH4JsRVs1pDCc8dLSQp+7PpUpX3ePdHyeNSFTeSe7FtKyQkrlPvHwJOW3SLd8Oyg==", + "license": "MIT", + "dependencies": { + "undici-types": "~5.26.4" + } }, "node_modules/@types/pg": { - "version": "8.6.6", - "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.6.6.tgz", - "integrity": "sha512-O2xNmXebtwVekJDD+02udOncjVcMZQuTEQEMpKJ0ZRf5E7/9JJX3izhKUcUifBkyKpljyUM6BTgy2trmviKlpw==", + "version": "8.11.6", + "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.11.6.tgz", + "integrity": "sha512-/2WmmBXHLsfRqzfHW7BNZ8SbYzE8OSk7i3WjFYvfgRHj7S1xj+16Je5fUKv3lVdVzk/zn9TXOqf+avFCFIE0yQ==", + "license": "MIT", "dependencies": { "@types/node": "*", "pg-protocol": "*", - "pg-types": "^2.2.0" + "pg-types": "^4.0.1" } }, + "node_modules/obuf": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/obuf/-/obuf-1.1.2.tgz", + "integrity": "sha512-PX1wu0AmAdPqOL1mWhqmlOd8kOIZQwGZw6rh7uby9fTc5lhaOWFLX3I6R1hrF9k3zUY40e6igsLGkDXK92LJNg==", + "license": "MIT" + }, "node_modules/pg-int8": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/pg-int8/-/pg-int8-1.0.1.tgz", "integrity": "sha512-WCtabS6t3c8SkpDBUlb1kjOs7l66xsGdKpIPZsg4wR+B3+u9UAum2odSsF9tnvxg80h4ZxLWMy4pRjOsFIqQpw==", + "license": "ISC", "engines": { "node": ">=4.0.0" } }, + "node_modules/pg-numeric": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/pg-numeric/-/pg-numeric-1.0.2.tgz", + "integrity": "sha512-BM/Thnrw5jm2kKLE5uJkXqqExRUY/toLHda65XgFTBTFYZyopbKjBe29Ii3RbkvlsMoFwD+tHeGaCjjv0gHlyw==", + "license": "ISC", + "engines": { + "node": ">=4" + } + }, "node_modules/pg-protocol": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.6.0.tgz", - "integrity": "sha512-M+PDm637OY5WM307051+bsDia5Xej6d9IR4GwJse1qA1DIhiKlksvrneZOYQq42OM+spubpcNYEo2FcKQrDk+Q==" + "version": "1.6.1", + "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.6.1.tgz", + "integrity": "sha512-jPIlvgoD63hrEuihvIg+tJhoGjUsLPn6poJY9N5CnlPd91c2T18T/9zBtLxZSb1EhYxBRoZJtzScCaWlYLtktg==", + "license": "MIT" }, "node_modules/pg-types": { - "version": 
"2.2.0", - "resolved": "https://registry.npmjs.org/pg-types/-/pg-types-2.2.0.tgz", - "integrity": "sha512-qTAAlrEsl8s4OiEQY69wDvcMIdQN6wdz5ojQiOy6YRMuynxenON0O5oCpJI6lshc6scgAY8qvJ2On/p+CXY0GA==", + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/pg-types/-/pg-types-4.0.2.tgz", + "integrity": "sha512-cRL3JpS3lKMGsKaWndugWQoLOCoP+Cic8oseVcbr0qhPzYD5DWXK+RZ9LY9wxRf7RQia4SCwQlXk0q6FCPrVng==", + "license": "MIT", "dependencies": { "pg-int8": "1.0.1", - "postgres-array": "~2.0.0", - "postgres-bytea": "~1.0.0", - "postgres-date": "~1.0.4", - "postgres-interval": "^1.1.0" + "pg-numeric": "1.0.2", + "postgres-array": "~3.0.1", + "postgres-bytea": "~3.0.0", + "postgres-date": "~2.1.0", + "postgres-interval": "^3.0.0", + "postgres-range": "^1.1.1" }, "engines": { - "node": ">=4" + "node": ">=10" } }, "node_modules/postgres-array": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/postgres-array/-/postgres-array-2.0.0.tgz", - "integrity": "sha512-VpZrUqU5A69eQyW2c5CA1jtLecCsN2U/bD6VilrFDWq5+5UIEVO7nazS3TEcHf1zuPYO/sqGvUvW62g86RXZuA==", + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/postgres-array/-/postgres-array-3.0.2.tgz", + "integrity": "sha512-6faShkdFugNQCLwucjPcY5ARoW1SlbnrZjmGl0IrrqewpvxvhSLHimCVzqeuULCbG0fQv7Dtk1yDbG3xv7Veog==", + "license": "MIT", "engines": { - "node": ">=4" + "node": ">=12" } }, "node_modules/postgres-bytea": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-1.0.0.tgz", - "integrity": "sha512-xy3pmLuQqRBZBXDULy7KbaitYqLcmxigw14Q5sj8QBVLqEwXfeybIKVWiqAXTlcvdvb0+xkOtDbfQMOf4lST1w==", + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-3.0.0.tgz", + "integrity": "sha512-CNd4jim9RFPkObHSjVHlVrxoVQXz7quwNFpz7RY1okNNme49+sVyiTvTRobiLV548Hx/hb1BG+iE7h9493WzFw==", + "license": "MIT", + "dependencies": { + "obuf": "~1.1.2" + }, "engines": { - "node": ">=0.10.0" + "node": ">= 6" } }, "node_modules/postgres-date": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/postgres-date/-/postgres-date-1.0.7.tgz", - "integrity": "sha512-suDmjLVQg78nMK2UZ454hAG+OAW+HQPZ6n++TNDUX+L0+uUlLywnoxJKDou51Zm+zTCjrCl0Nq6J9C5hP9vK/Q==", + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/postgres-date/-/postgres-date-2.1.0.tgz", + "integrity": "sha512-K7Juri8gtgXVcDfZttFKVmhglp7epKb1K4pgrkLxehjqkrgPhfG6OO8LHLkfaqkbpjNRnra018XwAr1yQFWGcA==", + "license": "MIT", "engines": { - "node": ">=0.10.0" + "node": ">=12" } }, "node_modules/postgres-interval": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/postgres-interval/-/postgres-interval-1.2.0.tgz", - "integrity": "sha512-9ZhXKM/rw350N1ovuWHbGxnGh/SNJ4cnxHiM0rxE4VN41wsg8P8zWn9hv/buK00RP4WvlOyr/RBDiptyxVbkZQ==", - "dependencies": { - "xtend": "^4.0.0" - }, + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/postgres-interval/-/postgres-interval-3.0.0.tgz", + "integrity": "sha512-BSNDnbyZCXSxgA+1f5UU2GmwhoI0aU5yMxRGO8CdFEcY2BQF9xm/7MqKnYoM1nJDk8nONNWDk9WeSmePFhQdlw==", + "license": "MIT", "engines": { - "node": ">=0.10.0" + "node": ">=12" } }, + "node_modules/postgres-range": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/postgres-range/-/postgres-range-1.1.4.tgz", + "integrity": "sha512-i/hbxIE9803Alj/6ytL7UHQxRvZkI9O4Sy+J3HGc4F4oo/2eQAjTSNJ0bfxyse3bH0nuVesCk+3IRLaMtG3H6w==", + "license": "MIT" + }, + "node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + 
"integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", + "license": "MIT" + }, "node_modules/ws": { "version": "8.17.1", "resolved": "https://registry.npmjs.org/ws/-/ws-8.17.1.tgz", @@ -114,14 +156,6 @@ "optional": true } } - }, - "node_modules/xtend": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", - "integrity": "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==", - "engines": { - "node": ">=0.4" - } } } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/package.json b/test_runner/pg_clients/typescript/serverless-driver/package.json index 3ae7a8a6cfcd..f791d184c5f8 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package.json @@ -1,7 +1,7 @@ { "type": "module", "dependencies": { - "@neondatabase/serverless": "0.9.0", + "@neondatabase/serverless": "0.9.4", "ws": "8.17.1" } } diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index e117c2140f5e..f2ee2b70aac6 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -21,8 +21,6 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv: [ # eviction might be the first one after an attach to access the layers ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction", - # detach can happen before we get to validate the generation number - ".*deletion backend: Dropped remote consistent LSN updates for tenant.*", ] ) assert isinstance(env.pageserver_remote_storage, LocalFsStorage) @@ -58,10 +56,6 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N env.pageserver.allowed_errors.extend( [ - # This fixture detaches the tenant, and tests using it will tend to re-attach it - # shortly after. There may be un-processed deletion_queue validations from the - # initial attachment - ".*Dropped remote consistent LSN updates.*", # This fixture is for tests that will intentionally generate 400 responses ".*Error processing HTTP request: Bad request", ] diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 035ab2796f6a..922a21a99929 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -211,7 +211,7 @@ def op(): def check_pageserver(expect_success: bool, **conn_kwargs): check_connection( env.pageserver, - f"get_last_record_rlsn {env.initial_tenant} {timeline_id}", + f"show {env.initial_tenant}", expect_success, **conn_kwargs, ) diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index 97ab69049d00..4d2cdb8e320a 100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -14,11 +14,6 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): ) env = neon_env_builder.init_start() - for pageserver in env.pageservers: - # This test dual-attaches a tenant, one of the pageservers will therefore - # be running with a stale generation. 
- pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - env.neon_cli.create_branch("test_change_pageserver") endpoint = env.endpoints.create_start("test_change_pageserver") diff --git a/test_runner/regress/test_endpoint_crash.py b/test_runner/regress/test_endpoint_crash.py new file mode 100644 index 000000000000..ae3dded437a0 --- /dev/null +++ b/test_runner/regress/test_endpoint_crash.py @@ -0,0 +1,23 @@ +import pytest +from fixtures.neon_fixtures import NeonEnvBuilder + + +@pytest.mark.parametrize( + "sql_func", + [ + "trigger_panic", + "trigger_segfault", + "💣", # calls `trigger_segfault` internally + ], +) +def test_endpoint_crash(neon_env_builder: NeonEnvBuilder, sql_func: str): + """ + Test that triggering crash from neon_test_utils crashes the endpoint + """ + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_endpoint_crash") + endpoint = env.endpoints.create_start("test_endpoint_crash") + + endpoint.safe_psql("CREATE EXTENSION neon_test_utils;") + with pytest.raises(Exception, match="This probably means the server terminated abnormally"): + endpoint.safe_psql(f"SELECT {sql_func}();") diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index d97e882a7093..4dae9176b83f 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -88,7 +88,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build env.pageserver.allowed_errors.extend( [ - ".*error importing base backup .*", + ".*Failed to import basebackup.*", + ".*unexpected non-zero bytes after the tar archive.*", ".*Timeline got dropped without initializing, cleaning its files.*", ".*InternalServerError.*timeline not found.*", ".*InternalServerError.*Tenant .* not found.*", diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 54d3b2d515c5..3b2218dd9b09 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -39,9 +39,6 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_configs() env.start() - env.pageserver.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py index a6f05fe0f712..6465bdfd217d 100644 --- a/test_runner/regress/test_lfc_working_set_approximation.py +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -1,3 +1,4 @@ +import time from pathlib import Path from fixtures.log_helper import log @@ -72,3 +73,46 @@ def test_lfc_working_set_approximation(neon_simple_env: NeonEnv): blocks = query_scalar(cur, "select approximate_working_set_size(true)") log.info(f"working set size after some index access of a few select pages only {blocks}") assert blocks < 10 + + +def test_sliding_working_set_approximation(neon_simple_env: NeonEnv): + env = neon_simple_env + + endpoint = env.endpoints.create_start( + branch_name="main", + config_lines=[ + "autovacuum = off", + "shared_buffers=1MB", + "neon.max_file_cache_size=256MB", + "neon.file_cache_size_limit=245MB", + ], + ) + conn = endpoint.connect() + cur = conn.cursor() + cur.execute("create extension neon version '1.4'") + cur.execute( + "create table t(pk integer primary key, count integer default 0, payload text default repeat('?', 128))" + ) + cur.execute("insert 
into t (pk) values (generate_series(1,1000000))") + time.sleep(2) + before_10k = time.monotonic() + cur.execute("select sum(count) from t where pk between 10000 and 20000") + time.sleep(2) + before_1k = time.monotonic() + cur.execute("select sum(count) from t where pk between 1000 and 2000") + after = time.monotonic() + + cur.execute(f"select approximate_working_set_size_seconds({int(after - before_1k + 1)})") + estimation_1k = cur.fetchall()[0][0] + log.info(f"Working set size for selecting 1k records {estimation_1k}") + + cur.execute(f"select approximate_working_set_size_seconds({int(after - before_10k + 1)})") + estimation_10k = cur.fetchall()[0][0] + log.info(f"Working set size for selecting 10k records {estimation_10k}") + + cur.execute("select pg_table_size('t')") + size = cur.fetchall()[0][0] // 8192 + log.info(f"Table size {size} blocks") + + assert estimation_1k >= 20 and estimation_1k <= 40 + assert estimation_10k >= 200 and estimation_10k <= 400 diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index ca3c81d6e51d..41283e4d2ca0 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -4,7 +4,6 @@ from string import ascii_lowercase import pytest -from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( AuxFileStore, @@ -13,7 +12,7 @@ logical_replication_sync, wait_for_last_flush_lsn, ) -from fixtures.utils import query_scalar, wait_until +from fixtures.utils import wait_until def random_string(n: int): @@ -326,12 +325,17 @@ def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg): assert "could not receive data from WAL stream" not in logs -# Test compute start at LSN page of which starts with contrecord -# https://github.com/neondatabase/neon/issues/5749 +# Test replication of WAL record spanning page boundary (with contrecord) after +# compute restart and WAL write of the page. +# +# See https://github.com/neondatabase/neon/issues/5749 +# +# Most pages start with a contrecord, so we don't do anything special +# to ensure that. @pytest.mark.parametrize( "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] ) -def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): +def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env env.neon_cli.create_branch("init") @@ -356,52 +360,6 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): logical_replication_sync(vanilla_pg, endpoint) vanilla_pg.stop() - with endpoint.cursor() as cur: - # measure how much space logical message takes. Sometimes first attempt - # creates huge message and then it stabilizes, have no idea why. - for _ in range(3): - lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - log.info(f"current_lsn={lsn_before}") - # Non-transactional logical message doesn't write WAL, only XLogInsert's - # it, so use transactional. Which is a bit problematic as transactional - # necessitates commit record. Alternatively we can do smth like - # select neon_xlogflush(pg_current_wal_insert_lsn()); - # but isn't much better + that particular call complains on 'xlog flush - # request 0/282C018 is not satisfied' as pg_current_wal_insert_lsn skips - # page headers. 
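The 20-40 and 200-400 block bounds asserted in test_sliding_working_set_approximation above follow from simple row-width arithmetic; a rough sketch, assuming each row of t occupies about 170 bytes on disk (4-byte pk, 4-byte count, 128-byte payload, plus tuple header and alignment):

    rows_per_page = 8192 // 170          # roughly 48 rows fit on one 8 KB heap page
    pages_1k = 1001 // rows_per_page     # ~20 pages touched by the 1k-row range scan  -> bound 20..40
    pages_10k = 10001 // rows_per_page   # ~208 pages touched by the 10k-row range scan -> bound 200..400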
- payload = "blahblah" - cur.execute(f"select pg_logical_emit_message(true, 'pref', '{payload}')") - lsn_after_by_curr_wal_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - lsn_diff = lsn_after_by_curr_wal_lsn - lsn_before - logical_message_base = lsn_after_by_curr_wal_lsn - lsn_before - len(payload) - log.info( - f"before {lsn_before}, after {lsn_after_by_curr_wal_lsn}, lsn diff is {lsn_diff}, base {logical_message_base}" - ) - - # and write logical message spanning exactly as we want - lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - log.info(f"current_lsn={lsn_before}") - curr_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - offs = int(curr_lsn) % 8192 - till_page = 8192 - offs - payload_len = ( - till_page - logical_message_base - 8 - ) # not sure why 8 is here, it is deduced from experiments - log.info(f"current_lsn={curr_lsn}, offs {offs}, till_page {till_page}") - - # payload_len above would go exactly till the page boundary; but we want contrecord, so make it slightly longer - payload_len += 8 - - cur.execute(f"select pg_logical_emit_message(true, 'pref', 'f{'a' * payload_len}')") - supposedly_contrecord_end = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - log.info(f"supposedly_page_boundary={supposedly_contrecord_end}") - # The calculations to hit the page boundary are very fuzzy, so just - # ignore test if we fail to reach it. - if not (int(supposedly_contrecord_end) % 8192 == 32): - pytest.skip("missed page boundary, bad luck") - - cur.execute("insert into replication_example values (2, 3)") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) endpoint.stop().start() diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index 39b486502672..e83aaf91c60f 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -50,7 +50,7 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): # Ensure that the default version is also updated in the neon.control file assert cur.fetchone() == ("1.3",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") - all_versions = ["1.3", "1.2", "1.1", "1.0"] + all_versions = ["1.4", "1.3", "1.2", "1.1", "1.0"] current_version = "1.3" for idx, begin_version in enumerate(all_versions): for target_version in all_versions[idx + 1 :]: diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index b9e7e642b51c..51e847135efd 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -7,6 +7,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, + VanillaPostgres, import_timeline_from_vanilla_postgres, wait_for_wal_insert_lsn, ) @@ -182,3 +183,275 @@ def test_import_at_2bil( cur = conn.cursor() cur.execute("SELECT count(*) from t") assert cur.fetchone() == (10000 + 1 + 1,) + + +# Constants and macros copied from PostgreSQL multixact.c and headers. These are needed to +# calculate the SLRU segments that a particular multixid or multixid-offsets falls into. 
+BLCKSZ = 8192 +MULTIXACT_OFFSETS_PER_PAGE = int(BLCKSZ / 4) +SLRU_PAGES_PER_SEGMENT = int(32) +MXACT_MEMBER_BITS_PER_XACT = 8 +MXACT_MEMBER_FLAGS_PER_BYTE = 1 +MULTIXACT_FLAGBYTES_PER_GROUP = 4 +MULTIXACT_MEMBERS_PER_MEMBERGROUP = MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE +MULTIXACT_MEMBERGROUP_SIZE = 4 * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP +MULTIXACT_MEMBERGROUPS_PER_PAGE = int(BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) +MULTIXACT_MEMBERS_PER_PAGE = MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP + + +def MultiXactIdToOffsetSegment(xid: int): + return int(xid / (SLRU_PAGES_PER_SEGMENT * MULTIXACT_OFFSETS_PER_PAGE)) + + +def MXOffsetToMemberSegment(off: int): + return int(off / (SLRU_PAGES_PER_SEGMENT * MULTIXACT_MEMBERS_PER_PAGE)) + + +def advance_multixid_to( + pg_bin: PgBin, vanilla_pg: VanillaPostgres, next_multi_xid: int, next_multi_offset: int +): + """ + Use pg_resetwal to advance the nextMulti and nextMultiOffset values in a stand-alone + Postgres cluster. This is useful to get close to wraparound or some other interesting + value, without having to burn a lot of time consuming the (multi-)XIDs one by one. + + The new values should be higher than the old ones, in a wraparound-aware sense. + + On entry, the server should be running. It will be shut down and restarted. + """ + + # Read old values from the last checkpoint. We will pass the old oldestMultiXid value + # back to pg_resetwal, there's no option to leave it alone. + with vanilla_pg.connect() as conn: + with conn.cursor() as cur: + # Make sure the oldest-multi-xid value in the control file is up-to-date + cur.execute("checkpoint") + cur.execute("select oldest_multi_xid, next_multixact_id from pg_control_checkpoint()") + rec = cur.fetchone() + assert rec is not None + (ckpt_oldest_multi_xid, ckpt_next_multi_xid) = rec + log.info(f"oldestMultiXid was {ckpt_oldest_multi_xid}, nextMultiXid was {ckpt_next_multi_xid}") + log.info(f"Resetting to {next_multi_xid}") + + # Use pg_resetwal to reset the next multiXid and multiOffset to given values. + vanilla_pg.stop() + pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") + cmd = [ + pg_resetwal_path, + f"--multixact-ids={next_multi_xid},{ckpt_oldest_multi_xid}", + f"--multixact-offset={next_multi_offset}", + "-D", + str(vanilla_pg.pgdatadir), + ] + pg_bin.run_capture(cmd) + + # Because we skip over a lot of values, Postgres hasn't created the SLRU segments for + # the new values yet. Create them manually, to allow Postgres to start up. + # + # This leaves "gaps" in the SLRU where segments between old value and new value are + # missing. That's OK for our purposes. Autovacuum will print some warnings about the + # missing segments, but will clean it up by truncating the SLRUs up to the new value, + # closing the gap. 
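For the values the wraparound test below passes to advance_multixid_to (0x40000000 and 0x10000000 in its first call), the helpers above work out as follows; a small sketch that only reuses the constants defined here:

    # One offsets segment covers 32 pages * 2048 entries = 65536 multixids,
    # one members segment covers 32 pages * 1636 members = 52352 member offsets.
    assert MultiXactIdToOffsetSegment(0x40000000) == 0x4000  # "%04X" -> pg_multixact/offsets/4000
    assert MXOffsetToMemberSegment(0x10000000) == 5127       # "%04X" -> pg_multixact/members/1407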
+ segname = "%04X" % MultiXactIdToOffsetSegment(next_multi_xid) + log.info(f"Creating dummy segment pg_multixact/offsets/{segname}") + with open(vanilla_pg.pgdatadir / "pg_multixact" / "offsets" / segname, "w") as of: + of.write("\0" * SLRU_PAGES_PER_SEGMENT * BLCKSZ) + of.flush() + + segname = "%04X" % MXOffsetToMemberSegment(next_multi_offset) + log.info(f"Creating dummy segment pg_multixact/members/{segname}") + with open(vanilla_pg.pgdatadir / "pg_multixact" / "members" / segname, "w") as of: + of.write("\0" * SLRU_PAGES_PER_SEGMENT * BLCKSZ) + of.flush() + + # Start Postgres again and wait until autovacuum has processed all the databases + # + # This allows truncating the SLRUs, fixing the gaps with missing segments. + vanilla_pg.start() + with vanilla_pg.connect().cursor() as cur: + for _ in range(1000): + datminmxid = int( + query_scalar(cur, "select min(datminmxid::text::int8) from pg_database") + ) + log.info(f"datminmxid {datminmxid}") + if next_multi_xid - datminmxid < 1_000_000: # not wraparound-aware! + break + time.sleep(0.5) + + +def test_multixid_wraparound_import( + neon_env_builder: NeonEnvBuilder, + test_output_dir: Path, + pg_bin: PgBin, + vanilla_pg, +): + """ + Test that the wraparound of the "next-multi-xid" counter is handled correctly in + pageserver, And multi-offsets as well + """ + env = neon_env_builder.init_start() + + # In order to to test multixid wraparound, we need to first advance the counter to + # within spitting distance of the wraparound, that is 2^32 multi-XIDs. We could simply + # run a workload that consumes a lot of multi-XIDs until we approach that, but that + # takes a very long time. So we cheat. + # + # Our strategy is to create a vanilla Postgres cluster, and use pg_resetwal to + # directly set the multi-xid counter a higher value. However, we cannot directly set + # it to just before 2^32 (~ 4 billion), because that would make the exisitng + # 'relminmxid' values to look like they're in the future. It's not clear how the + # system would behave in that situation. So instead, we bump it up ~ 1 billion + # multi-XIDs at a time, and let autovacuum to process all the relations and update + # 'relminmxid' between each run. + # + # XXX: For the multi-offsets, most of the bump is done in the last call. This is + # because advancing it ~ 1 billion at a time hit a pathological case in the + # MultiXactMemberFreezeThreshold() function, causing autovacuum not trigger multixid + # freezing. See + # https://www.postgresql.org/message-id/85fb354c-f89f-4d47-b3a2-3cbd461c90a3%40iki.fi + # Multi-offsets don't have the same wraparound problems at 2 billion mark as + # multi-xids do, so one big jump is fine. 
+ vanilla_pg.configure( + [ + "log_autovacuum_min_duration = 0", + # Perform anti-wraparound vacuuming aggressively + "autovacuum_naptime='1 s'", + "autovacuum_freeze_max_age = 1000000", + "autovacuum_multixact_freeze_max_age = 1000000", + ], + ) + vanilla_pg.start() + advance_multixid_to(pg_bin, vanilla_pg, 0x40000000, 0x10000000) + advance_multixid_to(pg_bin, vanilla_pg, 0x80000000, 0x20000000) + advance_multixid_to(pg_bin, vanilla_pg, 0xC0000000, 0x30000000) + advance_multixid_to(pg_bin, vanilla_pg, 0xFFFFFF00, 0xFFFFFF00) + + vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser") + vanilla_pg.safe_psql("create table tt as select g as id from generate_series(1, 10) g") + vanilla_pg.safe_psql("CHECKPOINT") + + # Import the cluster to the pageserver + tenant_id = TenantId.generate() + env.pageserver.tenant_create(tenant_id) + timeline_id = TimelineId.generate() + import_timeline_from_vanilla_postgres( + test_output_dir, + env, + pg_bin, + tenant_id, + timeline_id, + "imported_multixid_wraparound_test", + vanilla_pg.connstr(), + ) + vanilla_pg.stop() + + endpoint = env.endpoints.create_start( + "imported_multixid_wraparound_test", + tenant_id=tenant_id, + config_lines=[ + "log_autovacuum_min_duration = 0", + "autovacuum_naptime='5 s'", + "autovacuum=off", + ], + ) + conn = endpoint.connect() + cur = conn.cursor() + assert query_scalar(cur, "select count(*) from tt") == 10 # sanity check + + # Install extension containing function needed for test + cur.execute("CREATE EXTENSION neon_test_utils") + + # Consume a lot of XIDs, just to advance the XIDs to different range than the + # multi-xids. That avoids confusion while debugging + cur.execute("select test_consume_xids(100000)") + cur.execute("select pg_switch_wal()") + cur.execute("checkpoint") + + # Use subtransactions so that each row in 'tt' is stamped with different XID. Leave + # the transaction open. + cur.execute("BEGIN") + cur.execute( + """ +do $$ +declare + idvar int; +begin + for idvar in select id from tt loop + begin + update tt set id = idvar where id = idvar; + exception when others then + raise 'didn''t expect an error: %', sqlerrm; + end; + end loop; +end; +$$; +""" + ) + + # In a different transaction, acquire a FOR KEY SHARE lock on each row. This generates + # a new multixid for each row, with the previous xmax and this transaction's XID as the + # members. + # + # Repeat this until the multi-xid counter wraps around. + conn3 = endpoint.connect() + cur3 = conn3.cursor() + next_multixact_id_before_restart = 0 + observed_before_wraparound = False + while True: + cur3.execute("BEGIN") + cur3.execute("SELECT * FROM tt FOR KEY SHARE") + + # Get the xmax of one of the rows we locked. It should be a multi-xid. It might + # not be the latest one, but close enough. + row_xmax = int(query_scalar(cur3, "SELECT xmax FROM tt LIMIT 1")) + cur3.execute("COMMIT") + log.info(f"observed a row with xmax {row_xmax}") + + # High value means not wrapped around yet + if row_xmax >= 0xFFFFFF00: + observed_before_wraparound = True + continue + + # xmax should not be a regular XID. (We bumped up the regular XID range earlier + # to around 100000 and above.) + assert row_xmax < 100 + + # xmax values < FirstNormalTransactionId (== 3) could be special XID values, or + # multixid values after wraparound. 
We don't know for sure which, so keep going to + # be sure we see value that's unambiguously a wrapped-around multixid + if row_xmax < 3: + continue + + next_multixact_id_before_restart = row_xmax + log.info( + f"next_multixact_id is now at {next_multixact_id_before_restart} or a little higher" + ) + break + + # We should have observed the state before wraparound + assert observed_before_wraparound + + cur.execute("COMMIT") + + # Wait until pageserver has received all the data, and restart the endpoint + wait_for_wal_insert_lsn(env, endpoint, tenant_id, timeline_id) + endpoint.stop(mode="immediate") # 'immediate' to avoid writing shutdown checkpoint + endpoint.start() + + # Check that the next-multixid value wrapped around correctly + conn = endpoint.connect() + cur = conn.cursor() + cur.execute("select next_multixact_id from pg_control_checkpoint()") + next_multixact_id_after_restart = int( + query_scalar(cur, "select next_multixact_id from pg_control_checkpoint()") + ) + log.info(f"next_multixact_id after restart: {next_multixact_id_after_restart}") + assert next_multixact_id_after_restart >= next_multixact_id_before_restart + + # The multi-offset should wrap around as well + cur.execute("select next_multi_offset from pg_control_checkpoint()") + next_multi_offset_after_restart = int( + query_scalar(cur, "select next_multi_offset from pg_control_checkpoint()") + ) + log.info(f"next_multi_offset after restart: {next_multi_offset_after_restart}") + assert next_multi_offset_after_restart < 100000 diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index abbea59113f1..caeae7fd15c6 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -85,8 +85,10 @@ def check_client(env: NeonEnv, client: PageserverHttpClient): # create new tenant and check it is also there tenant_id = TenantId.generate() - client.tenant_create( - tenant_id, generation=env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id) + env.pageserver.tenant_create( + tenant_id, + generation=env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id), + auth_token=client.auth_token, ) assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()} diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 696af24e5c0a..7ce38c5c3c82 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -249,10 +249,6 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"] assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 - main_pageserver.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) - # Now advance the generation in the control plane: subsequent validations # from the running pageserver will fail. No more deletions should happen. env.storage_controller.attach_hook_issue(env.initial_tenant, other_pageserver.id) @@ -397,8 +393,6 @@ def assert_deletions_submitted(n: int) -> None: # validated before restart. assert get_deletion_queue_executed(ps_http) == before_restart_depth else: - main_pageserver.allowed_errors.extend([".*Dropping stale deletions.*"]) - # If we lost the attachment, we should have dropped our pre-restart deletions. 
assert get_deletion_queue_dropped(ps_http) == before_restart_depth @@ -553,13 +547,6 @@ def test_multi_attach( tenant_id = env.initial_tenant timeline_id = env.initial_timeline - # We will intentionally create situations where stale deletions happen from non-latest-generation - # nodes when the tenant is multiply-attached - for ps in env.pageservers: - ps.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) - # Initially, the tenant will be attached to the first pageserver (first is default in our test harness) wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active")) _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) diff --git a/test_runner/regress/test_pageserver_reconnect.py b/test_runner/regress/test_pageserver_reconnect.py index aecfcdd262e5..37ff923632d2 100644 --- a/test_runner/regress/test_pageserver_reconnect.py +++ b/test_runner/regress/test_pageserver_reconnect.py @@ -2,6 +2,7 @@ import time from contextlib import closing +import psycopg2.errors from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, PgBin @@ -40,3 +41,26 @@ def run_pgbench(connstr: str): c.execute("select pg_reload_conf()") thread.join() + + +# Test handling errors during page server reconnect +def test_pageserver_reconnect_failure(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_pageserver_reconnect") + endpoint = env.endpoints.create_start("test_pageserver_reconnect") + + con = endpoint.connect() + cur = con.cursor() + + cur.execute("set statement_timeout='2s'") + cur.execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'") + connstring = cur.fetchall()[0][0] + cur.execute( + f"alter system set neon.pageserver_connstring='{connstring}?some_invalid_param=xyz'" + ) + cur.execute("select pg_reload_conf()") + try: + cur.execute("select count(*) from pg_class") + except psycopg2.errors.QueryCanceled: + log.info("Connection to PS failed") + assert not endpoint.log_contains("ERROR: cannot wait on socket event without a socket.*") diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 8431840dc069..0416078ebc67 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -16,6 +16,8 @@ from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage from fixtures.utils import wait_until from fixtures.workload import Workload +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response # A tenant configuration that is convenient for generating uploads and deletions # without a large amount of postgres traffic. @@ -59,7 +61,7 @@ def evict_random_layers( @pytest.mark.parametrize("seed", [1, 2, 3]) -def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): +def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, seed: int): """ Issue many location configuration changes, ensure that tenants remain readable & we don't get any unexpected errors. 
We should @@ -73,6 +75,20 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=s3_storage(), ) + neon_env_builder.control_plane_compute_hook_api = ( + f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach" + ) + + def ignore_notify(request: Request): + # This test does all its own compute configuration (by passing explicit pageserver ID to Workload functions), + # so we send controller notifications to /dev/null to prevent it fighting the test for control of the compute. + log.info(f"Ignoring storage controller compute notification: {request.json}") + return Response(status=200) + + make_httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler( + ignore_notify + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) pageservers = env.pageservers @@ -83,9 +99,6 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): for ps in env.pageservers: ps.allowed_errors.extend( [ - # We will make no effort to avoid stale attachments - ".*Dropped remote consistent LSN updates.*", - ".*Dropping stale deletions.*", # page_service_conn_main{peer_addr=[::1]:41176}: query handler for 'pagestream 3b19aec5038c796f64b430b30a555121 d07776761d44050b8aab511df1657d83' failed: Tenant 3b19aec5038c796f64b430b30a555121 not found ".*query handler.*Tenant.*not found.*", # page_service_conn_main{peer_addr=[::1]:45552}: query handler for 'pagestream 414ede7ad50f775a8e7d9ba0e43b9efc a43884be16f44b3626482b6981b2c745' failed: Tenant 414ede7ad50f775a8e7d9ba0e43b9efc is not active @@ -102,6 +115,15 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): workload.init(env.pageservers[0].id) workload.write_rows(256, env.pageservers[0].id) + # Discourage the storage controller from interfering with the changes we will make directly on the pageserver + env.storage_controller.tenant_policy_update( + tenant_id, + { + "scheduling": "Stop", + }, + ) + env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy Stop.*") + # We use a fixed seed to make the test reproducible: we want a randomly # chosen order, but not to change the order every time we run the test. rng = random.Random(seed) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 756a2c17c909..54b493ec705d 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -8,8 +8,11 @@ import pytest from fixtures.neon_fixtures import ( + Endpoint, + NeonEnv, NeonEnvBuilder, check_restored_datadir_content, + tenant_get_shards, ) from fixtures.pg_version import PgVersion from fixtures.remote_storage import s3_storage @@ -21,6 +24,97 @@ from pytest import CaptureFixture +TENANT_CONF = { + # Scaled down thresholds so that we are exercising the pageserver beyond just writing + # ephemeral/L0 layers, and because debug-mode code is slow to read from full sized ephemeral layer files. + "pitr_interval": "60s", + "checkpoint_distance": f"{8 * 1024 * 1024}", + "compaction_target_size": f"{8 * 1024 * 1024}", +} + +# # Ensure that compaction works, on a timeline containing all the diversity that postgres regression tests create. +# # There should have been compactions mid-test as well, this final check is in addition those. 
+# for (shard, pageserver) in tenant_get_shards(env, env.initial_tenant): +# pageserver.http_client().timeline_checkpoint(env.initial_tenant, env.initial_timeline, force_repartition=True, force_image_layer_creation=True) + + +def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: Endpoint): + """ + After running some opaque tests that create interesting content in a timeline, run + some generic integrity checks that the storage stack is able to reproduce the written + data properly. + """ + + ignored_files: Optional[list[str]] = None + + # Neon handles unlogged relations in a special manner. During a + # basebackup, we ship the init fork as the main fork. This presents a + # problem in that the endpoint's data directory and the basebackup will + # have differences and will fail the eventual file comparison. + # + # Unlogged tables were introduced in version 9.1. ALTER TABLE grew + # support for setting the persistence of a table in 9.5. The reason that + # this doesn't affect versions < 15 (but probably would between 9.1 and + # 9.5) is that all the regression tests that deal with unlogged tables + # up until that point dropped the unlogged tables or set them to logged + # at some point during the test. + # + # In version 15, Postgres grew support for unlogged sequences, and with + # that came a few more regression tests. These tests did not all drop + # the unlogged tables/sequences prior to finishing. + # + # But unlogged sequences came with a bug in that, sequences didn't + # inherit the persistence of their "parent" tables if they had one. This + # was fixed and backported to 15, thus exacerbating our problem a bit. + # + # So what we can do is just ignore file differences between the data + # directory and basebackup for unlogged relations. + results = cast( + "list[tuple[str, str]]", + endpoint.safe_psql( + """ + SELECT + relkind, + pg_relation_filepath( + pg_filenode_relation(reltablespace, relfilenode) + ) AS unlogged_relation_paths + FROM pg_class + WHERE relpersistence = 'u' + """, + dbname=db_name, + ), + ) + + unlogged_relation_files: list[str] = [] + for r in results: + unlogged_relation_files.append(r[1]) + # This is related to the following Postgres commit: + # + # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b + # Author: Heikki Linnakangas + # Date: 2023-08-23 09:21:31 -0500 + # + # Use the buffer cache when initializing an unlogged index. + # + # This patch was backpatched to 16. Without it, the LSN in the + # page header would be 0/0 in the data directory, which wouldn't + # match the LSN generated during the basebackup, thus creating + # a difference. + if env.pg_version <= PgVersion.V15 and r[0] == "i": + unlogged_relation_files.append(f"{r[1]}_init") + + ignored_files = unlogged_relation_files + + check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files) + + # Ensure that compaction works, on a timeline containing all the diversity that postgres regression tests create. + # There should have been compactions mid-test as well, this final check is in addition those. + for shard, pageserver in tenant_get_shards(env, env.initial_tenant): + pageserver.http_client().timeline_checkpoint( + shard, env.initial_timeline, force_repartition=True, force_image_layer_creation=True + ) + + # Run the main PostgreSQL regression tests, in src/test/regress. 
# @pytest.mark.timeout(600) @@ -45,7 +139,10 @@ def test_pg_regress( neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, + initial_tenant_shard_count=shard_count, + ) # Connect to postgres and create a database called "regression". endpoint = env.endpoints.create_start("main") @@ -84,67 +181,7 @@ def test_pg_regress( with capsys.disabled(): pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - ignored_files: Optional[list[str]] = None - - # Neon handles unlogged relations in a special manner. During a - # basebackup, we ship the init fork as the main fork. This presents a - # problem in that the endpoint's data directory and the basebackup will - # have differences and will fail the eventual file comparison. - # - # Unlogged tables were introduced in version 9.1. ALTER TABLE grew - # support for setting the persistence of a table in 9.5. The reason that - # this doesn't affect versions < 15 (but probably would between 9.1 and - # 9.5) is that all the regression tests that deal with unlogged tables - # up until that point dropped the unlogged tables or set them to logged - # at some point during the test. - # - # In version 15, Postgres grew support for unlogged sequences, and with - # that came a few more regression tests. These tests did not all drop - # the unlogged tables/sequences prior to finishing. - # - # But unlogged sequences came with a bug in that, sequences didn't - # inherit the persistence of their "parent" tables if they had one. This - # was fixed and backported to 15, thus exacerbating our problem a bit. - # - # So what we can do is just ignore file differences between the data - # directory and basebackup for unlogged relations. - results = cast( - "list[tuple[str, str]]", - endpoint.safe_psql( - """ - SELECT - relkind, - pg_relation_filepath( - pg_filenode_relation(reltablespace, relfilenode) - ) AS unlogged_relation_paths - FROM pg_class - WHERE relpersistence = 'u' - """, - dbname=DBNAME, - ), - ) - - unlogged_relation_files: list[str] = [] - for r in results: - unlogged_relation_files.append(r[1]) - # This is related to the following Postgres commit: - # - # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b - # Author: Heikki Linnakangas - # Date: 2023-08-23 09:21:31 -0500 - # - # Use the buffer cache when initializing an unlogged index. - # - # This patch was backpatched to 16. Without it, the LSN in the - # page header would be 0/0 in the data directory, which wouldn't - # match the LSN generated during the basebackup, thus creating - # a difference. - if env.pg_version <= PgVersion.V15 and r[0] == "i": - unlogged_relation_files.append(f"{r[1]}_init") - - ignored_files = unlogged_relation_files - - check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files) + post_checks(env, test_output_dir, DBNAME, endpoint) # Run the PostgreSQL "isolation" tests, in src/test/isolation. 
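post_checks above is intended to be reusable by any test that leaves interesting data in a timeline. A hypothetical minimal caller, sketched with the fixtures this file already uses (the test name and database name are illustrative):

    def test_minimal_post_checks_example(neon_env_builder, test_output_dir):
        env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
        endpoint = env.endpoints.create_start("main")
        endpoint.safe_psql("CREATE DATABASE example")
        # generate some data worth checking
        endpoint.safe_psql("create table t as select generate_series(1, 10000) g", dbname="example")
        post_checks(env, test_output_dir, "example", endpoint)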
@@ -159,16 +196,20 @@ def test_isolation( pg_distrib_dir: Path, shard_count: Optional[int], ): + DBNAME = "isolation_regression" + if shard_count is not None: neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count + ) # Connect to postgres and create a database called "regression". # isolation tests use prepared transactions, so enable them endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"]) - endpoint.safe_psql("CREATE DATABASE isolation_regression") + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_isolation_regress to run in. runpath = test_output_dir / "regress" @@ -202,6 +243,9 @@ def test_isolation( with capsys.disabled(): pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath) + # This fails with a mismatch on `pg_multixact/offsets/0000` + # post_checks(env, test_output_dir, DBNAME, endpoint) + # Run extra Neon-specific pg_regress-based tests. The tests and their # schedule file are in the sql_regress/ directory. @@ -215,15 +259,19 @@ def test_sql_regress( pg_distrib_dir: Path, shard_count: Optional[int], ): + DBNAME = "regression" + if shard_count is not None: neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count + ) # Connect to postgres and create a database called "regression". endpoint = env.endpoints.create_start("main") - endpoint.safe_psql("CREATE DATABASE regression") + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_regress to run in. runpath = test_output_dir / "regress" @@ -258,4 +306,4 @@ def test_sql_regress( with capsys.disabled(): pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - check_restored_datadir_content(test_output_dir, env, endpoint) + post_checks(env, test_output_dir, DBNAME, endpoint) diff --git a/test_runner/regress/test_physical_replication.py b/test_runner/regress/test_physical_replication.py index a1bff32eedd1..043aff686b09 100644 --- a/test_runner/regress/test_physical_replication.py +++ b/test_runner/regress/test_physical_replication.py @@ -1,7 +1,11 @@ +from __future__ import annotations + import random import time +from typing import TYPE_CHECKING -from fixtures.neon_fixtures import NeonEnv +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv def test_physical_replication(neon_simple_env: NeonEnv): diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index b26bd3422f30..fac7fe9deef6 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -355,13 +355,6 @@ def churn_while_failpoints_active(result): env.pageserver.stop(immediate=True) env.endpoints.stop_all() - # We are about to forcibly drop local dirs. 
Storage controller will increment generation in re-attach before - # we later increment when actually attaching it again, leading to skipping a generation and potentially getting - # these warnings if there was a durable but un-executed deletion list at time of restart. - env.pageserver.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) - dir_to_clear = env.pageserver.tenant_dir() shutil.rmtree(dir_to_clear) os.mkdir(dir_to_clear) diff --git a/test_runner/regress/test_replica_start.py b/test_runner/regress/test_replica_start.py new file mode 100644 index 000000000000..17d476a8a690 --- /dev/null +++ b/test_runner/regress/test_replica_start.py @@ -0,0 +1,646 @@ +""" +In PostgreSQL, a standby always has to wait for a running-xacts WAL record to +arrive before it can start accepting queries. Furthermore, if there are +transactions with too many subxids (> 64) open to fit in the in-memory subxids +cache, the running-xacts record will be marked as "suboverflowed", and the +standby will need to also wait for the currently in-progress transactions to +finish. + +In Neon, we have an additional mechanism that scans the CLOG at server startup +to determine the list of running transactions, so that the standby can start up +immediately without waiting for the running-xacts record, but that mechanism +only works if the # of active (sub-)transactions is reasonably small. Otherwise +it falls back to waiting. Furthermore, it's somewhat optimistic in using up the +known-assigned XIDs array: if too many transactions with subxids are started in +the primary later, the replay in the replica will crash with "too many +KnownAssignedXids" error. + +This module contains tests for those various cases at standby startup: starting +from shutdown checkpoint, using the CLOG scanning mechanism, waiting for +running-xacts record and for in-progress transactions to finish etc. +""" + +import threading +from contextlib import closing + +import psycopg2 +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, wait_for_last_flush_lsn, wait_replica_caughtup +from fixtures.pg_version import PgVersion +from fixtures.utils import query_scalar, wait_until + +CREATE_SUBXACTS_FUNC = """ +create or replace function create_subxacts(n integer) returns void as $$ +declare + i integer; +begin + for i in 1..n loop + begin + insert into t (payload) values (0); + exception + when others then + raise exception 'caught something: %', sqlerrm; + end; + end loop; +end; $$ language plpgsql +""" + + +def test_replica_start_scan_clog(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup. There is one + transaction active in the primary when the standby is started. The primary + is killed before it has a chance to write a running-xacts record. The + CLOG-scanning at neon startup allows the standby to start up anyway. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + primary_cur.execute("select pg_switch_wal()") + + # Start a transaction in the primary. 
Leave the transaction open. + # + # The transaction has some subtransactions, but not too many to cause the + # CLOG-scanning mechanism to give up. + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(50)") + + # Wait for the WAL to be flushed, but then immediately kill the primary, + # before it has a chance to generate a running-xacts record. + primary_cur.execute("select neon_xlogflush()") + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + primary.stop(mode="immediate") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (0,) + + +def test_replica_start_scan_clog_crashed_xids(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup, after + leaving behind crashed transactions. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + primary_cur.execute("select pg_switch_wal()") + + # Consume a lot of XIDs, then kill Postgres without giving it a + # chance to write abort records for them. + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(100000)") + primary.stop(mode="immediate") + + # Restart the primary. Do some light work, and shut it down cleanly + primary.start() + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("insert into t (payload) values (0)") + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. (Restarting the primary writes a checkpoint and/or running-xacts + # record, which allows the standby to know that the crashed XIDs are aborted) + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (1,) + + +def test_replica_start_at_running_xacts(neon_simple_env: NeonEnv, pg_version): + """ + Test that starting a replica works right after the primary has + created a running-xacts record. This may seem like a trivial case, + but during development, we had a bug that was triggered by having + oldestActiveXid == nextXid. Starting right after a running-xacts + record is one way to test that case. + + See the module docstring for background. 
+ """ + env = neon_simple_env + + if env.pg_version == PgVersion.V14 or env.pg_version == PgVersion.V15: + pytest.skip("pg_log_standby_snapshot() function is available only in PG16") + + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("select pg_log_standby_snapshot()") + primary_cur.execute("select neon_xlogflush()") + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select 123") + assert secondary_cur.fetchone() == (123,) + + +def test_replica_start_wait_subxids_finish(neon_simple_env: NeonEnv): + """ + Test replica startup when there are a lot of (sub)transactions active in the + primary. That's too many for the CLOG-scanning mechanism to handle, so the + replica has to wait for the large transaction to finish before it starts to + accept queries. + + After replica startup, test MVCC with transactions that were in-progress + when the replica was started. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create + # lots of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Start a transaction with 100000 subtransactions, and leave it open. That's + # too many to fit in the "known-assigned XIDs array" in the replica, and + # also too many to fit in the subxid caches so the running-xacts record will + # also overflow. + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(100000)") + + # Start another, smaller transaction in the primary. We'll come back to this + # later. + primary_conn2 = primary.connect() + primary_cur2 = primary_conn2.cursor() + primary_cur2.execute("begin") + primary_cur2.execute("insert into t (payload) values (0)") + + # Create a replica. but before that, wait for the wal to be flushed to + # safekeepers, so that the replica is started at a point where the large + # transaction is already active. (The whole transaction might not be flushed + # yet, but that's OK.) + # + # Start it in a separate thread, so that we can do other stuff while it's + # blocked waiting for the startup to finish. + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + secondary = env.endpoints.new_replica(origin=primary, endpoint_id="secondary") + start_secondary_thread = threading.Thread(target=secondary.start) + start_secondary_thread.start() + + # Verify that the replica has otherwise started up, but cannot start + # accepting queries yet. + log.info("Waiting 5 s to verify that the secondary does not start") + start_secondary_thread.join(5) + assert secondary.log_contains("consistent recovery state reached") + assert secondary.log_contains("started streaming WAL from primary") + # The "redo starts" message is printed when the first WAL record is + # received. 
It might or might not be present in the log depending on how + # far exactly the WAL was flushed when the replica was started, and whether + # background activity caused any more WAL records to be flushed on the + # primary afterwards. + # + # assert secondary.log_contains("redo # starts") + + # should not be open for connections yet + assert start_secondary_thread.is_alive() + assert not secondary.is_running() + assert not secondary.log_contains("database system is ready to accept read-only connections") + + # Commit the large transaction in the primary. + # + # Within the next 15 s, the primary should write a new running-xacts record + # to the WAL which shows the transaction as completed. Once the replica + # replays that record, it will start accepting queries. + primary_cur.execute("commit") + start_secondary_thread.join() + + # Verify that the large transaction is correctly visible in the secondary + # (but not the second, small transaction, which is still in-progress!) + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + # Perform some more MVCC testing using the second transaction that was + # started in the primary before the replica was created + primary_cur2.execute("select create_subxacts(10000)") + + # The second transaction still hasn't committed + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("BEGIN ISOLATION LEVEL REPEATABLE READ") + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + # Commit the second transaction in the primary + primary_cur2.execute("commit") + + # Should still be invisible to the old snapshot + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + # Commit the REPEATABLE READ transaction in the replica. Both + # primary transactions should now be visible to a new snapshot. + secondary_cur.execute("commit") + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (110001,) + + +def test_replica_too_many_known_assigned_xids(neon_simple_env: NeonEnv): + """ + The CLOG-scanning mechanism fills the known-assigned XIDs array + optimistically at standby startup, betting that it can still fit + upcoming transactions replayed later from the WAL in the + array. This test tests what happens when that bet fails and the + known-assigned XID array fills up after the standby has already + been started. The WAL redo will fail with an error: + + FATAL: too many KnownAssignedXids + CONTEXT: WAL redo at 0/1895CB0 for neon/INSERT: off: 25, flags: 0x08; blkref #0: rel 1663/5/16385, blk 64 + + which causes the standby to shut down. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. 
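Back-of-the-envelope numbers for the overflow this test provokes, assuming the stock defaults of max_connections = 100 and superuser_reserved_connections = 3 (the real values are read from the server below):

    max_connections, reserved, n_subxids = 100, 3, 200
    open_at_standby_start = max_connections * 30               # ~3000 subxids in the one large transaction
    replayed_later = (max_connections - reserved) * n_subxids  # ~19400 more subxids, all still uncommitted
    # The standby only budgets on the order of 64 tracked XIDs (PGPROC_MAX_CACHED_SUBXIDS)
    # per possible backend, a few thousand slots in total here, so WAL replay eventually
    # hits the "too many KnownAssignedXids" FATAL described in the docstring.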
+ env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Determine how many connections we can use + primary_cur.execute("show max_connections") + max_connections = int(primary_cur.fetchall()[0][0]) + primary_cur.execute("show superuser_reserved_connections") + superuser_reserved_connections = int(primary_cur.fetchall()[0][0]) + n_connections = max_connections - superuser_reserved_connections + n_subxids = 200 + + # Start one top transaction in primary, with lots of subtransactions. This + # uses up much of the known-assigned XIDs space in the standby, but doesn't + # cause it to overflow. + large_p_conn = primary.connect() + large_p_cur = large_p_conn.cursor() + large_p_cur.execute("begin") + large_p_cur.execute(f"select create_subxacts({max_connections} * 30)") + + with closing(primary.connect()) as small_p_conn: + with small_p_conn.cursor() as small_p_cur: + small_p_cur.execute("select create_subxacts(1)") + + # Create a replica at this LSN + primary_cur.execute("select neon_xlogflush()") + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + + # The transaction in primary has not committed yet. + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (1,) + + # Start max number of top transactions in primary, with a lot of + # subtransactions each. We add the subtransactions to each top transaction + # in a round-robin fashion, instead of adding a lot of subtransactions to + # one top transaction at a time. This way, we will have the max number of + # subtransactions in the in-memory subxid cache of each top transaction, + # until they all overflow. + # + # Currently, PGPROC_MAX_CACHED_SUBXIDS == 64, so this will overflow the all + # the subxid caches after creating 64 subxids in each top transaction. The + # point just before the caches have overflowed is the most interesting point + # in time, but we'll keep going beyond that, to ensure that this test is + # robust even if PGPROC_MAX_CACHED_SUBXIDS changes. + p_curs = [] + for _ in range(0, n_connections): + p_cur = primary.connect().cursor() + p_cur.execute("begin") + p_curs.append(p_cur) + + for _subxid in range(0, n_subxids): + for i in range(0, n_connections): + p_curs[i].execute("select create_subxacts(1)") + + # Commit all the transactions in the primary + for i in range(0, n_connections): + p_curs[i].execute("commit") + large_p_cur.execute("commit") + + # Wait until the replica crashes with "too many KnownAssignedXids" error. 
+ def check_replica_crashed(): + try: + secondary.connect() + except psycopg2.Error: + # Once the connection fails, return success + return None + raise RuntimeError("connection succeeded") + + wait_until(20, 0.5, check_replica_crashed) + assert secondary.log_contains("too many KnownAssignedXids") + + # The replica has crashed, so ignore the stop result + secondary.check_stop_result = False + + +def test_replica_start_repro_visibility_bug(neon_simple_env: NeonEnv): + """ + Before PR #7288, a hot standby in neon incorrectly started up + immediately, before it had received a running-xacts record. That + led to visibility bugs if there were active transactions in the + primary. This test reproduces the incorrect query results and + incorrectly set hint bits that occurred before the fix. + """ + env = neon_simple_env + + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + p_cur = primary.connect().cursor() + + p_cur.execute("begin") + p_cur.execute("create table t(pk integer primary key, payload integer)") + p_cur.execute("insert into t values (generate_series(1,100000), 0)") + + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + wait_replica_caughtup(primary, secondary) + s_cur = secondary.connect().cursor() + + # Set hint bits for pg_class tuples. If the primary's transaction is + # not marked as in-progress in the MVCC snapshot, then the XMIN_INVALID + # hint bit will be set on the pg_class tuple for table 't', making it invisible + # even after the commit record is replayed later. + s_cur.execute("select * from pg_class") + + p_cur.execute("commit") + wait_replica_caughtup(primary, secondary) + s_cur.execute("select * from t where pk = 1") + assert s_cur.fetchone() == (1, 0) + + +@pytest.mark.parametrize("shutdown", [True, False]) +def test_replica_start_with_prepared_xacts(neon_simple_env: NeonEnv, shutdown: bool): + """ + Test the CLOG-scanning mechanism at hot standby startup in the presence of + prepared transactions. + + This test is run in two variants: one where the primary server is shut down + before starting the secondary, and one where it keeps running. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", endpoint_id="primary", config_lines=["max_prepared_transactions=5"] + ) + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute("create table t1(pk integer primary key)") + primary_cur.execute("create table t2(pk integer primary key)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Prepare a transaction for two-phase commit + primary_cur.execute("begin") + primary_cur.execute("insert into t1 values (1)") + primary_cur.execute("prepare transaction 't1'") + + # Prepare another transaction for two-phase commit, with a subtransaction + primary_cur.execute("begin") + primary_cur.execute("insert into t2 values (2)") + primary_cur.execute("savepoint sp") + primary_cur.execute("insert into t2 values (3)") + primary_cur.execute("prepare transaction 't2'") + + # Start a transaction in the primary. Leave the transaction open. + # + # The transaction has some subtransactions, but not too many to cause the + # CLOG-scanning mechanism to give up.
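+ # (In the 'shutdown' variant, the fast shutdown of the primary further below + # rolls this open transaction back, which is why the secondary is then expected + # to see zero rows in 't'.)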
+ primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(50)") + + # Wait for the WAL to be flushed + primary_cur.execute("select neon_xlogflush()") + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + + if shutdown: + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. + secondary = env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary", config_lines=["max_prepared_transactions=5"] + ) + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (0,) + secondary_cur.execute("select count(*) from t1") + assert secondary_cur.fetchone() == (0,) + secondary_cur.execute("select count(*) from t2") + assert secondary_cur.fetchone() == (0,) + + if shutdown: + primary.start() + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + else: + primary_cur.execute("commit") + primary_cur.execute("commit prepared 't1'") + primary_cur.execute("commit prepared 't2'") + + wait_replica_caughtup(primary, secondary) + + secondary_cur.execute("select count(*) from t") + if shutdown: + assert secondary_cur.fetchone() == (0,) + else: + assert secondary_cur.fetchone() == (50,) + secondary_cur.execute("select * from t1") + assert secondary_cur.fetchall() == [(1,)] + secondary_cur.execute("select * from t2") + assert secondary_cur.fetchall() == [(2,), (3,)] + + +def test_replica_start_with_prepared_xacts_with_subxacts(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup in the presence of + prepared transactions, with subtransactions. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", endpoint_id="primary", config_lines=["max_prepared_transactions=5"] + ) + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + + # Install extension containing function needed for test + primary_cur.execute("CREATE EXTENSION neon_test_utils") + + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Advance nextXid close to the beginning of the next pg_subtrans segment (2^16 XIDs) + # + # This is interesting, because it tests that pg_subtrans is initialized correctly + # at standby startup. (We had a bug where it didn't at one point during development.) + while True: + xid = int(query_scalar(primary_cur, "SELECT txid_current()")) + log.info(f"xid now {xid}") + # Consume 500 transactions at a time until we get close + if xid < 65535 - 600: + primary_cur.execute("select test_consume_xids(500);") + else: + break + primary_cur.execute("checkpoint") + + # Prepare a transaction for two-phase commit + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(1000)") + primary_cur.execute("prepare transaction 't1'") + + # Wait for the WAL to be flushed, and stop the primary + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. 
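+ # (A hot standby must allow at least as many prepared transactions as its + # primary, hence the matching max_prepared_transactions setting here.)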
+ secondary = env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary", config_lines=["max_prepared_transactions=5"] + ) + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (0,) + + primary.start() + + # Open a lot of subtransactions in the primary, causing the subxids cache to overflow + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("select create_subxacts(100000)") + + wait_replica_caughtup(primary, secondary) + + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + primary_cur.execute("commit prepared 't1'") + + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (101000,) + + +def test_replica_start_with_prepared_xacts_with_many_subxacts(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup in the presence of + prepared transactions, with lots of subtransactions. + + Like test_replica_start_with_prepared_xacts_with_subxacts, but with more + subxacts, to test that the prepared transaction's subxids don't consume + space in the known-assigned XIDs array. (They are set in pg_subtrans + instead) + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", endpoint_id="primary", config_lines=["max_prepared_transactions=5"] + ) + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + + # Install extension containing function needed for test + primary_cur.execute("CREATE EXTENSION neon_test_utils") + + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Prepare a transaction for two-phase commit, with lots of subxids + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(50000)") + + # to make things a bit more varied, intersperse a few other XIDs in between + # the prepared transaction's sub-XIDs + with primary.connect().cursor() as primary_cur2: + primary_cur2.execute("insert into t (payload) values (123)") + primary_cur2.execute("begin; insert into t (payload) values (-1); rollback") + + primary_cur.execute("select create_subxacts(50000)") + primary_cur.execute("prepare transaction 't1'") + + # Wait for the WAL to be flushed + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. 
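+ # (The prepared transaction's ~100,000 subxids are tracked in pg_subtrans on + # the standby rather than in the known-assigned XIDs array, so startup should + # not hit the "too many KnownAssignedXids" limit.)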
+ secondary = env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary", config_lines=["max_prepared_transactions=5"] + ) + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (1,) + + primary.start() + + # Open a lot of subtransactions in the primary, causing the subxids cache to overflow + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("select create_subxacts(100000)") + + wait_replica_caughtup(primary, secondary) + + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100001,) + + primary_cur.execute("commit prepared 't1'") + + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (200001,) diff --git a/test_runner/regress/test_replication_start.py b/test_runner/regress/test_replication_start.py deleted file mode 100644 index 236074599021..000000000000 --- a/test_runner/regress/test_replication_start.py +++ /dev/null @@ -1,32 +0,0 @@ -import pytest -from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup - - -@pytest.mark.xfail -def test_replication_start(neon_simple_env: NeonEnv): - env = neon_simple_env - - with env.endpoints.create_start(branch_name="main", endpoint_id="primary") as primary: - with primary.connect() as p_con: - with p_con.cursor() as p_cur: - p_cur.execute("begin") - p_cur.execute("create table t(pk integer primary key, payload integer)") - p_cur.execute("insert into t values (generate_series(1,100000), 0)") - p_cur.execute("select txid_current()") - xid = p_cur.fetchall()[0][0] - log.info(f"Master transaction {xid}") - with env.endpoints.new_replica_start( - origin=primary, endpoint_id="secondary" - ) as secondary: - wait_replica_caughtup(primary, secondary) - with secondary.connect() as s_con: - with s_con.cursor() as s_cur: - # Enforce setting hint bits for pg_class tuples. - # If master's transaction is not marked as in-progress in MVCC snapshot, - # then XMIN_INVALID hint bit will be set for table's 't' tuple makeing it invisible. 
- s_cur.execute("select * from pg_class") - p_cur.execute("commit") - wait_replica_caughtup(primary, secondary) - s_cur.execute("select * from t where pk = 1") - assert s_cur.fetchone() == (1, 0) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 62a9f422ee4d..4471237900b8 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -225,6 +225,12 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: workload.validate() workload.stop() + # Do a full image layer generation before splitting, so that when we compact after splitting + # we should only see sizes decrease (from post-split drops/rewrites), not increase (from image layer generation) + env.get_tenant_pageserver(tenant_id).http_client().timeline_compact( + tenant_id, timeline_id, force_image_layer_creation=True, wait_until_uploaded=True + ) + # Split one shard into two shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2) @@ -542,6 +548,13 @@ def check_effective_tenant_config(): for k, v in non_default_tenant_config.items(): assert config.effective_config[k] == v + # Check that heatmap uploads remain enabled after shard split + # (https://github.com/neondatabase/neon/issues/8189) + assert ( + config.effective_config["heatmap_period"] + and config.effective_config["heatmap_period"] != "0s" + ) + # Validate pageserver state: expect every child shard to have an attached and secondary location (total, attached) = get_node_shard_counts(env, tenant_ids=[tenant_id]) assert sum(attached.values()) == split_shard_count @@ -1137,10 +1150,6 @@ def test_sharding_split_failures( ) for ps in env.pageservers: - # When we do node failures and abandon a shard, it will de-facto have old generation and - # thereby be unable to publish remote consistent LSN updates - ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - # If we're using a failure that will panic the storage controller, all background # upcalls from the pageserver can fail ps.allowed_errors.append(".*calling control plane generation validation API failed.*") diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 9cc13ecfdbca..d37f7aae3dfd 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -60,11 +60,6 @@ def test_storage_controller_smoke( neon_env_builder.num_pageservers = 3 env = neon_env_builder.init_configs() - for pageserver in env.pageservers: - # This test detaches tenants during migration, which can race with deletion queue operations, - # during detach we only do an advisory flush, we don't wait for it. 
- pageserver.allowed_errors.extend([".*Dropped remote consistent LSN updates.*"]) - # Start services by hand so that we can skip a pageserver (this will start + register later) env.broker.try_start() env.storage_controller.start() @@ -315,7 +310,7 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up # Create a tenant directly via pageserver HTTP API, skipping the storage controller tenant_id = TenantId.generate() generation = 123 - origin_ps.http_client().tenant_create(tenant_id, generation=generation) + origin_ps.tenant_create(tenant_id, generation=generation) # As if doing a live migration, first configure origin into stale mode r = origin_ps.http_client().tenant_location_conf( @@ -484,9 +479,6 @@ def handler(request: Request): # Start running env = neon_env_builder.init_start() - # We will to an unclean migration, which will result in deletion queue warnings - env.pageservers[0].allowed_errors.append(".*Dropped remote consistent LSN updates for tenant.*") - # Initial notification from tenant creation assert len(notifications) == 1 expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = { @@ -1054,13 +1046,6 @@ def tenants_placed(): online_node_ids = set(range(1, len(env.pageservers) + 1)) - offline_node_ids for node_id in offline_node_ids: - env.get_pageserver(node_id).allowed_errors.append( - # In the case of the failpoint failure, the impacted pageserver - # still believes it has the tenant attached since location - # config calls into it will fail due to being marked offline. - ".*Dropped remote consistent LSN updates.*", - ) - if len(offline_node_ids) > 1: env.get_pageserver(node_id).allowed_errors.append( ".*Scheduling error when marking pageserver.*offline.*", @@ -1518,49 +1503,6 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto workload.validate() -def retryable_node_operation(op, ps_id, max_attempts, backoff): - while max_attempts > 0: - try: - op(ps_id) - return - except StorageControllerApiException as e: - max_attempts -= 1 - log.info(f"Operation failed ({max_attempts} attempts left): {e}") - - if max_attempts == 0: - raise e - - time.sleep(backoff) - - -def poll_node_status(env, node_id, desired_scheduling_policy, max_attempts, backoff): - log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy") - while max_attempts > 0: - try: - status = env.storage_controller.node_status(node_id) - policy = status["scheduling"] - if policy == desired_scheduling_policy: - return - else: - max_attempts -= 1 - log.info(f"Status call returned {policy=} ({max_attempts} attempts left)") - - if max_attempts == 0: - raise AssertionError( - f"Status for {node_id=} did not reach {desired_scheduling_policy=}" - ) - - time.sleep(backoff) - except StorageControllerApiException as e: - max_attempts -= 1 - log.info(f"Status call failed ({max_attempts} retries left): {e}") - - if max_attempts == 0: - raise e - - time.sleep(backoff) - - def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): """ Graceful reststart of storage controller clusters use the drain and @@ -1601,10 +1543,10 @@ def assert_shard_counts_balanced(env: NeonEnv, shard_counts, total_shards): # Perform a graceful rolling restart for ps in env.pageservers: - retryable_node_operation( + env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 ) - poll_node_status(env, ps.id, "PauseForRestart", max_attempts=6, backoff=5) + 
env.storage_controller.poll_node_status(ps.id, "PauseForRestart", max_attempts=6, backoff=5) shard_counts = get_node_shard_counts(env, tenant_ids) log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") @@ -1614,12 +1556,12 @@ def assert_shard_counts_balanced(env: NeonEnv, shard_counts, total_shards): assert sum(shard_counts.values()) == total_shards ps.restart() - poll_node_status(env, ps.id, "Active", max_attempts=10, backoff=1) + env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=10, backoff=1) - retryable_node_operation( + env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 ) - poll_node_status(env, ps.id, "Active", max_attempts=6, backoff=5) + env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=6, backoff=5) shard_counts = get_node_shard_counts(env, tenant_ids) log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") @@ -1636,7 +1578,7 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_configs() env.start() - tenant_count = 5 + tenant_count = 10 shard_count_per_tenant = 8 tenant_ids = [] @@ -1657,15 +1599,15 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): ps_id_to_drain = env.pageservers[0].id - retryable_node_operation( + env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps_id_to_drain, max_attempts=3, backoff=2, ) - poll_node_status(env, ps_id_to_drain, "Draining", max_attempts=6, backoff=2) + env.storage_controller.poll_node_status(ps_id_to_drain, "Draining", max_attempts=6, backoff=2) env.storage_controller.cancel_node_drain(ps_id_to_drain) - poll_node_status(env, ps_id_to_drain, "Active", max_attempts=6, backoff=2) + env.storage_controller.poll_node_status(ps_id_to_drain, "Active", max_attempts=6, backoff=2) diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py index d7f396262059..91caad722051 100644 --- a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -54,4 +54,4 @@ def insert_data(pub): pcur.execute(f"INSERT into t values ({n_records}, 0)") n_records += 1 with sub.cursor() as scur: - wait_until(10, 0.5, check_that_changes_propagated) + wait_until(60, 0.5, check_that_changes_propagated) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 2cbb036c0d7c..80fb2b55b8b2 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -320,10 +320,6 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): assert not config_path.exists(), "detach did not remove config file" - # The re-attach's increment of the generation number may invalidate deletion queue - # updates in flight from the previous attachment. 
- env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - env.pageserver.tenant_attach(tenant_id) wait_until( number_of_iterations=5, diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index d3fba32a19e0..1d7c8b8e31f0 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -67,8 +67,9 @@ def test_tenant_delete_smoke( # first try to delete non existing tenant tenant_id = TenantId.generate() - env.pageserver.allowed_errors.append(".*NotFound.*") - env.pageserver.allowed_errors.append(".*simulated failure.*") + env.pageserver.allowed_errors.extend( + [".*NotFound.*", ".*simulated failure.*", ".*failed to delete .+ objects.*"] + ) # Check that deleting a non-existent tenant gives the expected result: this is a loop because we # may need to retry on some remote storage errors injected by the test harness diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 2056840558e6..b165588636c7 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -76,10 +76,6 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str): env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # Our re-attach may race with the deletion queue processing LSN updates - # from the original attachment. - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: with endpoint.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key, value text)") @@ -349,10 +345,6 @@ def test_detach_while_attaching( env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # Our re-attach may race with the deletion queue processing LSN updates - # from the original attachment. - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point @@ -422,10 +414,6 @@ def test_detach_while_activating( env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # Our re-attach may race with the deletion queue processing LSN updates - # from the original attachment. 
- env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - - data_id = 1 data_secret = "very secret secret" insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 9fe732e28806..43e9a0d36e80 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -203,8 +203,6 @@ def test_tenant_relocation( [ # Needed for detach polling on the original pageserver f".*NotFound: tenant {tenant_id}.*", - # We will dual-attach in this test, so stale generations are expected - ".*Dropped remote consistent LSN updates.*", ] ) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 6c85ddebbcfb..b1ade77a1474 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -10,6 +10,7 @@ Endpoint, NeonEnv, NeonEnvBuilder, + flush_ep_to_pageserver, wait_for_last_flush_lsn, wait_for_wal_insert_lsn, ) @@ -710,3 +711,115 @@ def mask_model_inputs(x): return newlist else: return x + + +@pytest.mark.parametrize("zero_gc", [True, False]) +def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, zero_gc: bool): + """ + Compare an LSN lease to a read-only branch for synthetic size calculation. + They should have the same effect. + """ + + def assert_size_approx_equal_for_lease_test(size_lease, size_branch): + """ + Tests that evaluate sizes check the pageserver space consumption, + which sits many layers below the user input. The exact space needed + varies slightly depending on postgres behavior. + + Rather than expecting postgres to be deterministic and occasionally + failing the test, we permit sizes for the same data to vary by a few pages. + """ + + # FIXME(yuchen): The delta is too large; this is a temporary solution to pass the test reliably. + # Investigate and reduce the threshold. + threshold = 22 * 8272 + + log.info( + f"delta: size_branch({size_branch}) - size_lease({size_lease}) = {size_branch - size_lease}" + ) + + assert size_lease == pytest.approx(size_branch, abs=threshold) + + conf = { + "pitr_interval": "0s" if zero_gc else "3600s", + "gc_period": "0s", + "compaction_period": "0s", + } + + env = neon_env_builder.init_start(initial_tenant_conf=conf) + + ro_branch_res = insert_with_action( + env, env.initial_tenant, env.initial_timeline, test_output_dir, action="branch" + ) + + tenant, timeline = env.neon_cli.create_tenant(conf=conf) + lease_res = insert_with_action(env, tenant, timeline, test_output_dir, action="lease") + + assert_size_approx_equal_for_lease_test(lease_res, ro_branch_res) + + +def insert_with_action( + env: NeonEnv, + tenant: TenantId, + timeline: TimelineId, + test_output_dir: Path, + action: str, +) -> int: + """ + Inserts some data on the timeline, performs an action, and then inserts more data on the same timeline. + Returns the size at the end of the insertion. + + Valid actions: + - "lease": Acquires a lease. + - "branch": Creates a child branch but never writes to it.
+ """ + + client = env.pageserver.http_client() + with env.endpoints.create_start( + "main", + tenant_id=tenant, + config_lines=["autovacuum=off"], + ) as ep: + initial_size = client.tenant_size(tenant) + log.info(f"initial size: {initial_size}") + + with ep.cursor() as cur: + cur.execute( + "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline) + + if action == "lease": + res = client.timeline_lsn_lease(tenant, timeline, last_flush_lsn) + log.info(f"result from lsn_lease api: {res}") + elif action == "branch": + ro_branch = env.neon_cli.create_branch( + "ro_branch", tenant_id=tenant, ancestor_start_lsn=last_flush_lsn + ) + log.info(f"{ro_branch=} created") + else: + raise AssertionError("Invalid action type, only `lease` and `branch` are accepted") + + with ep.cursor() as cur: + cur.execute( + "CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + cur.execute( + "CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + cur.execute( + "CREATE TABLE t3 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + + last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline) + + # Avoid flakiness when calculating logical size. + flush_ep_to_pageserver(env, ep, tenant, timeline) + + size_after_action_and_insert = client.tenant_size(tenant) + log.info(f"{size_after_action_and_insert=}") + + size_debug_file = open(test_output_dir / f"size_debug_{action}.html", "w") + size_debug = client.tenant_size_debug(tenant) + size_debug_file.write(size_debug) + return size_after_action_and_insert diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 93e9ad367367..04b3fdd80fa5 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -41,18 +41,35 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): neon_simple_env.storage_controller.allowed_errors.extend(error_regexes) pageserver_http = neon_simple_env.pageserver.http_client() + + # Failure to write a config to local disk makes the pageserver assume that the local disk is bad and abort the process pageserver_http.configure_failpoints(("tenant-config-before-write", "return")) - with pytest.raises(Exception, match="tenant-config-before-write"): + + # The storage controller will see a torn TCP connection when the crash point is reached, and follow an unclean 500 error path + neon_simple_env.storage_controller.allowed_errors.extend( + [ + ".*Reconcile not done yet while creating tenant.*", + ".*Reconcile error: receive body: error sending request.*", + ".*Error processing HTTP request: InternalServerError.*", + ] + ) + + with pytest.raises(Exception, match="error sending request"): + _ = neon_simple_env.neon_cli.create_tenant() + + # Any files left behind on disk during failed creation do not prevent + # a retry from succeeding. Restart the pageserver with no failpoints. + neon_simple_env.pageserver.running = False + neon_simple_env.pageserver.start() + + # The failed creation should not be present in the list of tenants, as when we start up we'll see + # an empty tenant dir with no config in it.
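+ # (The leftover empty tenant directory makes the pageserver warn that it + # failed to load the tenant config on startup, hence the allowed error below.)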
+ neon_simple_env.pageserver.allowed_errors.append(".*Failed to load tenant config.*") new_tenants = sorted( map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) ) assert initial_tenants == new_tenants, "should not create new tenants" - # Any files left behind on disk during failed creation do not prevent - # a retry from succeeding. - pageserver_http.configure_failpoints(("tenant-config-before-write", "off")) neon_simple_env.neon_cli.create_tenant() @@ -369,10 +386,6 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): # generation nubmers out of order. env.pageserver.allowed_errors.append(".*Generation .+ is less than existing .+") - # Our multiple creation requests will advance generation quickly, and when we skip - # a generation number we can generate these warnings - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates for tenant .+") - # Timeline::flush_and_shutdown cannot tell if it is hitting a failure because of # an incomplete attach, or some other problem. In the field this should be rare, # so we allow it to log at WARN, even if it is occasionally a false positive. diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index f47356839c26..5e9a42f6b41e 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -152,10 +152,12 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder): client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) + size_limit_mb = 30 + endpoint_main = env.endpoints.create( "test_timeline_size_quota_on_startup", # Set small limit for the test - config_lines=["neon.max_cluster_size=30MB"], + config_lines=[f"neon.max_cluster_size={size_limit_mb}MB"], ) endpoint_main.start() @@ -165,17 +167,39 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder): # Insert many rows. This query must fail because of space limit try: - for _i in range(5000): - cur.execute( - """ - INSERT INTO foo - SELECT 'long string to consume some space' || g - FROM generate_series(1, 100) g - """ - ) - # If we get here, the timeline size limit failed - log.error("Query unexpectedly succeeded") + def write_rows(count): + for _i in range(count): + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100) g + """ + ) + + # Write some data that exceeds limit, then let the pageserver ingest it to guarantee that some feedback has made it to + # the safekeeper, then try to write some more. We expect either the initial writes or the ones after + # the wait_for_last_flush_lsn to generate an exception. + # + # Without the wait_for_last_flush_lsn, the size limit sometimes isn't enforced (see https://github.com/neondatabase/neon/issues/6562) + write_rows(2500) + wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id) + logical_size = env.pageserver.http_client().timeline_detail( + env.initial_tenant, new_timeline_id + )["current_logical_size"] + assert logical_size > size_limit_mb * 1024 * 1024 + write_rows(2500) + + # If we get here, the timeline size limit failed. Find out from the pageserver how large it + # thinks the timeline is. 
+ wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id) + logical_size = env.pageserver.http_client().timeline_detail( + env.initial_tenant, new_timeline_id + )["current_logical_size"] + log.error( + f"Query unexpectedly succeeded, pageserver logical size is {logical_size}" + ) raise AssertionError() except psycopg2.errors.DiskFull as err: diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index ac1a3bef67bd..7efd86e3497d 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1725,7 +1725,10 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # Basic pull_timeline test. -def test_pull_timeline(neon_env_builder: NeonEnvBuilder): +# When live_sk_change is False, the compute is restarted to change the set of +# safekeepers; otherwise it is reconfigured live. +@pytest.mark.parametrize("live_sk_change", [False, True]) +def test_pull_timeline(neon_env_builder: NeonEnvBuilder, live_sk_change: bool): neon_env_builder.auth_enabled = True def execute_payload(endpoint: Endpoint): @@ -1758,8 +1761,7 @@ def show_statuses(safekeepers: List[Safekeeper], tenant_id: TenantId, timeline_i log.info("Use only first 3 safekeepers") env.safekeepers[3].stop() endpoint = env.endpoints.create("main") - endpoint.active_safekeepers = [1, 2, 3] - endpoint.start() + endpoint.start(safekeepers=[1, 2, 3]) execute_payload(endpoint) show_statuses(env.safekeepers, tenant_id, timeline_id) @@ -1771,29 +1773,22 @@ def show_statuses(safekeepers: List[Safekeeper], tenant_id: TenantId, timeline_i log.info("Initialize new safekeeper 4, pull data from 1 & 3") env.safekeepers[3].start() - res = ( - env.safekeepers[3] - .http_client(auth_token=env.auth_keys.generate_safekeeper_token()) - .pull_timeline( - { - "tenant_id": str(tenant_id), - "timeline_id": str(timeline_id), - "http_hosts": [ - f"http://localhost:{env.safekeepers[0].port.http}", - f"http://localhost:{env.safekeepers[2].port.http}", - ], - } - ) + res = env.safekeepers[3].pull_timeline( + [env.safekeepers[0], env.safekeepers[2]], tenant_id, timeline_id ) log.info("Finished pulling timeline") log.info(res) show_statuses(env.safekeepers, tenant_id, timeline_id) - log.info("Restarting compute with new config to verify that it works") - endpoint.stop_and_destroy().create("main") - endpoint.active_safekeepers = [1, 3, 4] - endpoint.start() + action = "reconfiguring" if live_sk_change else "restarting" + log.info(f"{action} compute with new config to verify that it works") + new_sks = [1, 3, 4] + if not live_sk_change: + endpoint.stop_and_destroy().create("main") + endpoint.start(safekeepers=new_sks) + else: + endpoint.reconfigure(safekeepers=new_sks) execute_payload(endpoint) show_statuses(env.safekeepers, tenant_id, timeline_id) @@ -2196,24 +2191,25 @@ def test_s3_eviction( ): neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start( - initial_tenant_conf={ - "checkpoint_timeout": "100ms", - } - ) - extra_opts = [ + neon_env_builder.safekeeper_extra_opts = [ "--enable-offload", "--partial-backup-timeout", "50ms", "--control-file-save-interval", "1s", + # Safekeepers usually wait a while before evicting something: for this test we want them to + # evict things as soon as they are inactive.
+ "--eviction-min-resident=100ms", ] if delete_offloaded_wal: - extra_opts.append("--delete-offloaded-wal") + neon_env_builder.safekeeper_extra_opts.append("--delete-offloaded-wal") - for sk in env.safekeepers: - sk.stop().start(extra_opts=extra_opts) + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_timeout": "100ms", + } + ) n_timelines = 5 @@ -2268,7 +2264,7 @@ def test_s3_eviction( # restarting random safekeepers for sk in env.safekeepers: if random.random() < restart_chance: - sk.stop().start(extra_opts=extra_opts) + sk.stop().start() time.sleep(0.5) # require at least one successful eviction in at least one safekeeper diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 7845c122d51d..ad73770c446e 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 7845c122d51d3ebb547a984a640ac0310a2fadce +Subproject commit ad73770c446ea361f43e4f0404798b7e5e7a62d8 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 2ff5ecc67c64..4874c8e52ed3 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 2ff5ecc67c64e5fe44b7dde598e64e4538e0c373 +Subproject commit 4874c8e52ed349a9f8290bbdcd91eb92677a5d24 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index d55e0aca104a..b810fdfcbb59 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit d55e0aca104af0b611cf5565f1033b2acd2dcc1c +Subproject commit b810fdfcbb59afea7ea7bbe0cf94eaccb55a2ea2 diff --git a/vendor/revisions.json b/vendor/revisions.json index e755cf2e9dfa..da49ff19c3ec 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.3", "d55e0aca104af0b611cf5565f1033b2acd2dcc1c"], - "v15": ["15.7", "2ff5ecc67c64e5fe44b7dde598e64e4538e0c373"], - "v14": ["14.12", "7845c122d51d3ebb547a984a640ac0310a2fadce"] + "v16": ["16.3", "b810fdfcbb59afea7ea7bbe0cf94eaccb55a2ea2"], + "v15": ["15.7", "4874c8e52ed349a9f8290bbdcd91eb92677a5d24"], + "v14": ["14.12", "ad73770c446ea361f43e4f0404798b7e5e7a62d8"] } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f43076171f21..832fe06bf697 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -30,13 +30,12 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd clap = { version = "4", features = ["derive", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8" } +deranged = { version = "0.3", default-features = false, features = ["powerfmt", "serde", "std"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } futures-channel = { version = "0.3", features = ["sink"] } -futures-core = { version = "0.3" } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } -futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } hashbrown = { version = "0.14", features = ["raw"] } @@ -69,6 +68,7 @@ sha2 = { version = "0.10", features = ["asm"] } smallvec = { version = "1", default-features = false, features = ["const_new", "write"] } subtle = { version = "2" } sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } +tikv-jemalloc-sys = { version = "0.5" } time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version 
= "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-rustls = { version = "0.24" } @@ -106,7 +106,9 @@ num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] } +proc-macro2 = { version = "1" } prost = { version = "0.11" } +quote = { version = "1" } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" }