diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index f84beff20c52..11adc8df86ec 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -183,7 +183,7 @@ runs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Store Allure test stat in the DB (new) if: ${{ !cancelled() && inputs.store-test-results-into-db == 'true' }} diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index d5c1fcf524de..daaedf6d11d2 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -56,14 +56,14 @@ runs: if: inputs.build_type != 'remote' uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact path: /tmp/neon - name: Download Neon binaries for the previous release if: inputs.build_type != 'remote' uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ inputs.build_type }}-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact path: /tmp/neon-previous prefix: latest @@ -89,7 +89,7 @@ runs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Install Python deps shell: bash -euxo pipefail {0} @@ -114,6 +114,8 @@ runs: export PLATFORM=${PLATFORM:-github-actions-selfhosted} export POSTGRES_DISTRIB_DIR=${POSTGRES_DISTRIB_DIR:-/tmp/neon/pg_install} export DEFAULT_PG_VERSION=${PG_VERSION#v} + export LD_LIBRARY_PATH=${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/lib + export BENCHMARK_CONNSTR=${BENCHMARK_CONNSTR:-} if [ "${BUILD_TYPE}" = "remote" ]; then export REMOTE_ENV=1 @@ -178,7 +180,15 @@ runs: # Wake up the cluster if we use remote neon instance if [ "${{ inputs.build_type }}" = "remote" ] && [ -n "${BENCHMARK_CONNSTR}" ]; then - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "SELECT version();" + QUERIES=("SELECT version()") + if [[ "${PLATFORM}" = "neon"* ]]; then + QUERIES+=("SHOW neon.tenant_id") + QUERIES+=("SHOW neon.timeline_id") + fi + + for q in "${QUERIES[@]}"; do + ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/psql ${BENCHMARK_CONNSTR} -c "${q}" + done fi # Run the tests. diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 9eff4836809d..899cae2b8658 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -77,7 +77,7 @@ jobs: - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest @@ -99,7 +99,14 @@ jobs: # Set --sparse-ordering option of pytest-order plugin # to ensure tests are running in order of appears in the file. 
# It's important for test_perf_pgbench.py::test_pgbench_remote_* tests - extra_params: -m remote_cluster --sparse-ordering --timeout 5400 --ignore test_runner/performance/test_perf_olap.py --ignore test_runner/performance/test_perf_pgvector_queries.py + extra_params: + -m remote_cluster + --sparse-ordering + --timeout 5400 + --ignore test_runner/performance/test_perf_olap.py + --ignore test_runner/performance/test_perf_pgvector_queries.py + --ignore test_runner/performance/test_logical_replication.py + --ignore test_runner/performance/test_physical_replication.py env: BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" @@ -125,6 +132,69 @@ jobs: env: SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + replication-tests: + env: + POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install + DEFAULT_PG_VERSION: 14 + TEST_OUTPUT: /tmp/test_output + BUILD_TYPE: remote + SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }} + PLATFORM: "neon-staging" + + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:pinned + options: --init + + steps: + - uses: actions/checkout@v4 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Run benchmark + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_logical_replication.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 5400 + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Run benchmark + uses: ./.github/actions/run-python-test-set + with: + build_type: ${{ env.BUILD_TYPE }} + test_selection: performance/test_physical_replication.py + run_in_parallel: false + save_perf_report: ${{ env.SAVE_PERF_REPORT }} + extra_params: -m remote_cluster --timeout 5400 + env: + VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" + PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" + NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Periodic replication testing: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + generate-matrices: if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }} # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday) @@ -235,15 +305,10 @@ jobs: - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo 
"${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Create Neon Project if: contains(fromJson('["neon-captest-new", "neon-captest-freetier", "neonvm-captest-new", "neonvm-captest-freetier"]'), matrix.platform) id: create-neon-project @@ -282,16 +347,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: Benchmark init uses: ./.github/actions/run-python-test-set with: @@ -373,29 +428,16 @@ jobs: - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Set up Connection String id: set-up-connstr run: | CONNSTR=${{ secrets.BENCHMARK_PGVECTOR_CONNSTR }} - - echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done + echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - name: Benchmark pgvector hnsw indexing uses: ./.github/actions/run-python-test-set @@ -417,12 +459,12 @@ jobs: test_selection: performance/test_perf_pgvector_queries.py run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} - extra_params: -m remote_cluster --timeout 21600 + extra_params: -m remote_cluster --timeout 21600 env: BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }} VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" - + - name: Create Allure report if: ${{ !cancelled() }} uses: ./.github/actions/allure-report-generate @@ -473,15 +515,10 @@ jobs: - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Set up Connection String id: set-up-connstr run: | @@ -503,16 +540,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: ClickBench benchmark uses: ./.github/actions/run-python-test-set with: @@ -576,15 +603,10 @@ jobs: - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Get Connstring Secret Name run: | case "${PLATFORM}" in @@ -613,16 +635,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") 
- if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: Run TPC-H benchmark uses: ./.github/actions/run-python-test-set with: @@ -677,15 +689,10 @@ jobs: - name: Download Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-release-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact path: /tmp/neon/ prefix: latest - - name: Add Postgres binaries to PATH - run: | - ${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin/pgbench --version - echo "${POSTGRES_DISTRIB_DIR}/v${DEFAULT_PG_VERSION}/bin" >> $GITHUB_PATH - - name: Set up Connection String id: set-up-connstr run: | @@ -707,16 +714,6 @@ jobs: echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT - QUERIES=("SELECT version()") - if [[ "${PLATFORM}" = "neon"* ]]; then - QUERIES+=("SHOW neon.tenant_id") - QUERIES+=("SHOW neon.timeline_id") - fi - - for q in "${QUERIES[@]}"; do - psql ${CONNSTR} -c "${q}" - done - - name: Run user examples uses: ./.github/actions/run-python-test-set with: diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 5a94dd8e6f2d..a69686bf2a6e 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -63,14 +63,16 @@ jobs: mkdir -p /tmp/.docker-custom echo DOCKER_CONFIG=/tmp/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v2 + - uses: docker/setup-buildx-action@v3 + with: + cache-binary: false - - uses: docker/login-action@v2 + - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/build-push-action@v4 + - uses: docker/build-push-action@v6 with: context: . 
provenance: false @@ -82,6 +84,7 @@ jobs: tags: neondatabase/build-tools:${{ inputs.image-tag }}-${{ matrix.arch }} - name: Remove custom docker config directory + if: always() run: | rm -rf /tmp/.docker-custom diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 87f04996fd81..cb7655e03908 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -30,7 +30,7 @@ jobs: if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} uses: ./.github/workflows/check-permissions.yml with: - github-event-name: ${{ github.event_name}} + github-event-name: ${{ github.event_name }} cancel-previous-e2e-tests: needs: [ check-permissions ] @@ -109,7 +109,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: ./scripts/pysync @@ -149,7 +149,7 @@ jobs: # !~/.cargo/registry/src # ~/.cargo/git/ # target/ -# key: v1-${{ runner.os }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-clippy-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} # Some of our rust modules use FFI and need those to be checked - name: Get postgres headers @@ -291,29 +291,29 @@ jobs: # target/ # # Fall back to older versions of the key, if no cache for current Cargo.lock was found # key: | -# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} -# v1-${{ runner.os }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- +# v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}-${{ hashFiles('Cargo.lock') }} +# v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-cargo-${{ hashFiles('rust-toolchain.toml') }}- - name: Cache postgres v14 build id: cache_pg_14 uses: actions/cache@v4 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v15 build id: cache_pg_15 uses: actions/cache@v4 with: path: pg_install/v15 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Cache postgres v16 build id: cache_pg_16 uses: actions/cache@v4 with: path: pg_install/v16 - key: v1-${{ runner.os }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} - name: Build postgres v14 if: steps.cache_pg_14.outputs.cache-hit != 'true' @@ -335,6 +335,8 @@ jobs: - name: Run cargo build run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR ${cov_prefix} mold -run cargo build $CARGO_FLAGS 
$CARGO_FEATURES --bins --tests # Do install *before* running rust tests because they might recompile the @@ -383,6 +385,11 @@ jobs: env: NEXTEST_RETRIES: 3 run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR + LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib + export LD_LIBRARY_PATH + #nextest does not yet support running doctests cargo test --doc $CARGO_FLAGS $CARGO_FEATURES @@ -411,7 +418,7 @@ jobs: - name: Upload Neon artifact uses: ./.github/actions/upload with: - name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact path: /tmp/neon # XXX: keep this after the binaries.list is formed, so the coverage can properly work later @@ -490,7 +497,7 @@ jobs: uses: actions/cache@v4 with: path: ~/.cache/pypoetry/virtualenvs - key: v1-${{ runner.os }}-python-deps-${{ hashFiles('poetry.lock') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-python-deps-${{ hashFiles('poetry.lock') }} - name: Install Python deps run: ./scripts/pysync @@ -639,7 +646,7 @@ jobs: - name: Get Neon artifact uses: ./.github/actions/download with: - name: neon-${{ runner.os }}-${{ matrix.build_type }}-artifact + name: neon-${{ runner.os }}-${{ runner.arch }}-${{ matrix.build_type }}-artifact path: /tmp/neon - name: Get coverage artifact @@ -744,14 +751,16 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v2 + - uses: docker/setup-buildx-action@v3 + with: + cache-binary: false - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - - uses: docker/build-push-action@v5 + - uses: docker/build-push-action@v6 with: context: . build-args: | @@ -822,11 +831,12 @@ jobs: run: | mkdir -p .docker-custom echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV - - uses: docker/setup-buildx-action@v2 + - uses: docker/setup-buildx-action@v3 with: + cache-binary: false # Disable parallelism for docker buildkit. # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner. - config-inline: | + buildkitd-config-inline: | [worker.oci] max-parallelism = 1 @@ -842,7 +852,7 @@ jobs: password: ${{ secrets.AWS_SECRET_KEY_DEV }} - name: Build compute-node image - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: . build-args: | @@ -861,7 +871,7 @@ jobs: - name: Build neon extensions test image if: matrix.version == 'v16' - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: context: . build-args: | @@ -882,7 +892,7 @@ jobs: - name: Build compute-tools image # compute-tools are Postgres independent, so build it only once if: matrix.version == 'v16' - uses: docker/build-push-action@v5 + uses: docker/build-push-action@v6 with: target: compute-tools-image context: . 
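
The test step above runs `cargo test --doc` in addition to `cargo nextest run` because, as the workflow comment notes, nextest does not yet execute doctests. For reference, a doctest is just a fenced example inside a doc comment that `cargo test --doc` compiles and runs; a minimal illustrative sketch (the crate name `my_crate` and the `add` function are made up here, not taken from this repository):

````rust
/// Adds two numbers.
///
/// The example below is a doctest: `cargo test --doc` compiles and runs it,
/// while `cargo nextest run` skips it, which is why the workflow invokes both.
///
/// ```
/// assert_eq!(my_crate::add(2, 2), 4);
/// ```
pub fn add(a: u64, b: u64) -> u64 {
    a + b
}
````
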
@@ -1326,6 +1336,7 @@ jobs: env: BUCKET: neon-github-public-dev PREFIX: artifacts/latest + COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} run: | # Update compatibility snapshot for the release for pg_version in v14 v15 v16; do @@ -1339,8 +1350,8 @@ jobs: # Update Neon artifact for the release (reuse already uploaded artifact) for build_type in debug release; do - OLD_PREFIX=artifacts/${GITHUB_RUN_ID} - FILENAME=neon-${{ runner.os }}-${build_type}-artifact.tar.zst + OLD_PREFIX=artifacts/${COMMIT_SHA}/${GITHUB_RUN_ID} + FILENAME=neon-${{ runner.os }}-${{ runner.arch }}-${build_type}-artifact.tar.zst S3_KEY=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${OLD_PREFIX} | jq -r '.Contents[]?.Key' | grep ${FILENAME} | sort --version-sort | tail -1 || true) if [ -z "${S3_KEY}" ]; then @@ -1358,3 +1369,31 @@ jobs: with: from-tag: ${{ needs.build-build-tools-image.outputs.image-tag }} secrets: inherit + + # This job simplifies setting branch protection rules (in GitHub UI) + # by allowing to set only this job instead of listing many others. + # It also makes it easier to rename or parametrise jobs (using matrix) + # which requires changes in branch protection rules + # + # Note, that we can't add external check (like `neon-cloud-e2e`) we still need to use GitHub UI for that. + # + # https://github.com/neondatabase/neon/settings/branch_protection_rules + conclusion: + if: always() + # Format `needs` differently to make the list more readable. + # Usually we do `needs: [...]` + needs: + - check-codestyle-python + - check-codestyle-rust + - regress-tests + - test-images + runs-on: ubuntu-22.04 + steps: + # The list of possible results: + # https://docs.github.com/en/actions/learn-github-actions/contexts#needs-context + - name: Fail the job if any of the dependencies do not succeed + run: exit 1 + if: | + contains(needs.*.result, 'failure') + || contains(needs.*.result, 'cancelled') + || contains(needs.*.result, 'skipped') diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 7d2187e59cd5..11ff634b6c65 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -232,12 +232,19 @@ jobs: - name: Run cargo build run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR mold -run cargo build --locked $CARGO_FLAGS $CARGO_FEATURES --bins --tests -j$(nproc) - name: Run cargo test env: NEXTEST_RETRIES: 3 run: | + PQ_LIB_DIR=$(pwd)/pg_install/v16/lib + export PQ_LIB_DIR + LD_LIBRARY_PATH=$(pwd)/pg_install/v16/lib + export LD_LIBRARY_PATH + cargo nextest run $CARGO_FEATURES -j$(nproc) # Run separate tests for real S3 @@ -378,7 +385,7 @@ jobs: run: make walproposer-lib -j$(nproc) - name: Produce the build stats - run: cargo build --all --release --timings -j$(nproc) + run: PQ_LIB_DIR=$(pwd)/pg_install/v16/lib cargo build --all --release --timings -j$(nproc) - name: Upload the build stats id: upload-stats diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml new file mode 100644 index 000000000000..ed4e6be71239 --- /dev/null +++ b/.github/workflows/periodic_pagebench.yml @@ -0,0 +1,155 @@ +name: Periodic pagebench performance test on dedicated EC2 machine in eu-central-1 region + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ 
┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '0 18 * * *' # Runs at 6 PM UTC every day + workflow_dispatch: # Allows manual triggering of the workflow + inputs: + commit_hash: + type: string + description: 'The long neon repo commit hash for the system under test (pageserver) to be tested.' + required: false + default: '' + +defaults: + run: + shell: bash -euo pipefail {0} + +concurrency: + group: ${{ github.workflow }} + cancel-in-progress: false + +jobs: + trigger_bench_on_ec2_machine_in_eu_central_1: + runs-on: [ self-hosted, gen3, small ] + container: + image: neondatabase/build-tools:pinned + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + timeout-minutes: 360 # Set the timeout to 6 hours + env: + API_KEY: ${{ secrets.PERIODIC_PAGEBENCH_EC2_RUNNER_API_KEY }} + RUN_ID: ${{ github.run_id }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY : ${{ secrets.AWS_EC2_US_TEST_RUNNER_ACCESS_KEY_SECRET }} + AWS_DEFAULT_REGION : "eu-central-1" + AWS_INSTANCE_ID : "i-02a59a3bf86bc7e74" + steps: + # we don't need the neon source code because we run everything remotely + # however we still need the local github actions to run the allure step below + - uses: actions/checkout@v4 + + - name: Show my own (github runner) external IP address - usefull for IP allowlisting + run: curl https://ifconfig.me + + - name: Start EC2 instance and wait for the instance to boot up + run: | + aws ec2 start-instances --instance-ids $AWS_INSTANCE_ID + aws ec2 wait instance-running --instance-ids $AWS_INSTANCE_ID + sleep 60 # sleep some time to allow cloudinit and our API server to start up + + - name: Determine public IP of the EC2 instance and set env variable EC2_MACHINE_URL_US + run: | + public_ip=$(aws ec2 describe-instances --instance-ids $AWS_INSTANCE_ID --query 'Reservations[*].Instances[*].PublicIpAddress' --output text) + echo "Public IP of the EC2 instance: $public_ip" + echo "EC2_MACHINE_URL_US=https://${public_ip}:8443" >> $GITHUB_ENV + + - name: Determine commit hash + env: + INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }} + run: | + if [ -z "$INPUT_COMMIT_HASH" ]; then + echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV + else + echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV + fi + + - name: Start Bench with run_id + run: | + curl -k -X 'POST' \ + "${EC2_MACHINE_URL_US}/start_test/${GITHUB_RUN_ID}" \ + -H 'accept: application/json' \ + -H 'Content-Type: application/json' \ + -H "Authorization: Bearer $API_KEY" \ + -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}" + + - name: Poll Test Status + id: poll_step + run: | + status="" + while [[ "$status" != "failure" && "$status" != "success" ]]; do + response=$(curl -k -X 'GET' \ + "${EC2_MACHINE_URL_US}/test_status/${GITHUB_RUN_ID}" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer $API_KEY") + echo "Response: $response" + set +x + status=$(echo $response | jq -r '.status') + echo "Test status: $status" + if [[ "$status" == "failure" ]]; then + echo "Test failed" + exit 1 # Fail the job step if status is failure + elif [[ "$status" == "success" || "$status" == "null" ]]; then + break + elif [[ "$status" == "too_many_runs" ]]; then + echo "Too many runs already running" + echo "too_many_runs=true" >> "$GITHUB_OUTPUT" + exit 1 + fi + + sleep 60 # Poll every 60 seconds + done + + - name: Retrieve Test Logs + if: 
always() && steps.poll_step.outputs.too_many_runs != 'true' + run: | + curl -k -X 'GET' \ + "${EC2_MACHINE_URL_US}/test_log/${GITHUB_RUN_ID}" \ + -H 'accept: application/gzip' \ + -H "Authorization: Bearer $API_KEY" \ + --output "test_log_${GITHUB_RUN_ID}.gz" + + - name: Unzip Test Log and Print it into this job's log + if: always() && steps.poll_step.outputs.too_many_runs != 'true' + run: | + gzip -d "test_log_${GITHUB_RUN_ID}.gz" + cat "test_log_${GITHUB_RUN_ID}" + + - name: Create Allure report + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + if: ${{ !cancelled() }} + uses: ./.github/actions/allure-report-generate + + - name: Post to a Slack channel + if: ${{ github.event.schedule && failure() }} + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C033QLM5P7D" # dev-staging-stream + slack-message: "Periodic pagebench testing on dedicated hardware: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} + + - name: Cleanup Test Resources + if: always() + run: | + curl -k -X 'POST' \ + "${EC2_MACHINE_URL_US}/cleanup_test/${GITHUB_RUN_ID}" \ + -H 'accept: application/json' \ + -H "Authorization: Bearer $API_KEY" \ + -d '' + + - name: Stop EC2 instance and wait for the instance to be stopped + if: always() && steps.poll_step.outputs.too_many_runs != 'true' + run: | + aws ec2 stop-instances --instance-ids $AWS_INSTANCE_ID + aws ec2 wait instance-stopped --instance-ids $AWS_INSTANCE_ID diff --git a/.github/workflows/pg-clients.yml b/.github/workflows/pg-clients.yml new file mode 100644 index 000000000000..e21e45c929c8 --- /dev/null +++ b/.github/workflows/pg-clients.yml @@ -0,0 +1,115 @@ +name: Test Postgres client libraries + +on: + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '23 02 * * *' # run once a day, timezone is utc + pull_request: + paths: + - '.github/workflows/pg-clients.yml' + - 'test_runner/pg_clients/**' + - 'poetry.lock' + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref_name }} + cancel-in-progress: ${{ github.event_name == 'pull_request' }} + +defaults: + run: + shell: bash -euxo pipefail {0} + +env: + DEFAULT_PG_VERSION: 16 + PLATFORM: neon-captest-new + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }} + AWS_DEFAULT_REGION: eu-central-1 + +jobs: + check-permissions: + if: ${{ !contains(github.event.pull_request.labels.*.name, 'run-no-ci') }} + uses: ./.github/workflows/check-permissions.yml + with: + github-event-name: ${{ github.event_name }} + + check-build-tools-image: + needs: [ check-permissions ] + uses: ./.github/workflows/check-build-tools-image.yml + + build-build-tools-image: + needs: [ check-build-tools-image ] + uses: ./.github/workflows/build-build-tools-image.yml + with: + image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} + secrets: inherit + + test-postgres-client-libs: + needs: [ build-build-tools-image ] + runs-on: ubuntu-22.04 + + container: + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ 
secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init --user root + + steps: + - uses: actions/checkout@v4 + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Create Neon Project + id: create-neon-project + uses: ./.github/actions/neon-project-create + with: + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + postgres_version: ${{ env.DEFAULT_PG_VERSION }} + + - name: Run tests + uses: ./.github/actions/run-python-test-set + with: + build_type: remote + test_selection: pg_clients + run_in_parallel: false + extra_params: -m remote_cluster + pg_version: ${{ env.DEFAULT_PG_VERSION }} + env: + BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} + + - name: Delete Neon Project + if: always() + uses: ./.github/actions/neon-project-delete + with: + project_id: ${{ steps.create-neon-project.outputs.project_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Create Allure report + if: ${{ !cancelled() }} + id: create-allure-report + uses: ./.github/actions/allure-report-generate + with: + store-test-results-into-db: true + env: + REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }} + + - name: Post to a Slack channel + if: github.event.schedule && failure() + uses: slackapi/slack-github-action@v1 + with: + channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream + slack-message: | + Testing Postgres clients: <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|${{ job.status }}> (<${{ steps.create-allure-report.outputs.report-url }}|test report>) + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/.github/workflows/pg_clients.yml b/.github/workflows/pg_clients.yml deleted file mode 100644 index fef3aec754b2..000000000000 --- a/.github/workflows/pg_clients.yml +++ /dev/null @@ -1,98 +0,0 @@ -name: Test Postgres client libraries - -on: - schedule: - # * is a special character in YAML so you have to quote this string - # ┌───────────── minute (0 - 59) - # │ ┌───────────── hour (0 - 23) - # │ │ ┌───────────── day of the month (1 - 31) - # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) - # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) - - cron: '23 02 * * *' # run once a day, timezone is utc - - workflow_dispatch: - -concurrency: - # Allow only one workflow per any non-`main` branch. 
- group: ${{ github.workflow }}-${{ github.ref_name }}-${{ github.ref_name == 'main' && github.sha || 'anysha' }} - cancel-in-progress: true - -jobs: - test-postgres-client-libs: - # TODO: switch to gen2 runner, requires docker - runs-on: ubuntu-22.04 - - env: - DEFAULT_PG_VERSION: 14 - TEST_OUTPUT: /tmp/test_output - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - uses: actions/setup-python@v4 - with: - python-version: 3.9 - - - name: Install Poetry - uses: snok/install-poetry@v1 - - - name: Cache poetry deps - uses: actions/cache@v4 - with: - path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-python-deps-ubunutu-latest-${{ hashFiles('poetry.lock') }} - - - name: Install Python deps - shell: bash -euxo pipefail {0} - run: ./scripts/pysync - - - name: Create Neon Project - id: create-neon-project - uses: ./.github/actions/neon-project-create - with: - api_key: ${{ secrets.NEON_STAGING_API_KEY }} - postgres_version: ${{ env.DEFAULT_PG_VERSION }} - - - name: Run pytest - env: - REMOTE_ENV: 1 - BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }} - POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install - shell: bash -euxo pipefail {0} - run: | - # Test framework expects we have psql binary; - # but since we don't really need it in this test, let's mock it - mkdir -p "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin" && touch "$POSTGRES_DISTRIB_DIR/v${DEFAULT_PG_VERSION}/bin/psql"; - ./scripts/pytest \ - --junitxml=$TEST_OUTPUT/junit.xml \ - --tb=short \ - --verbose \ - -m "remote_cluster" \ - -rA "test_runner/pg_clients" - - - name: Delete Neon Project - if: ${{ always() }} - uses: ./.github/actions/neon-project-delete - with: - project_id: ${{ steps.create-neon-project.outputs.project_id }} - api_key: ${{ secrets.NEON_STAGING_API_KEY }} - - # We use GitHub's action upload-artifact because `ubuntu-latest` doesn't have configured AWS CLI. 
- # It will be fixed after switching to gen2 runner - - name: Upload python test logs - if: always() - uses: actions/upload-artifact@v4 - with: - retention-days: 7 - name: python-test-pg_clients-${{ runner.os }}-stage-logs - path: ${{ env.TEST_OUTPUT }} - - - name: Post to a Slack channel - if: ${{ github.event.schedule && failure() }} - uses: slackapi/slack-github-action@v1 - with: - channel-id: "C033QLM5P7D" # dev-staging-stream - slack-message: "Testing Postgres clients: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" - env: - SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }} diff --git a/Cargo.lock b/Cargo.lock index 5393538c5902..776d95c3c745 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1397,9 +1397,9 @@ dependencies = [ [[package]] name = "crc32c" -version = "0.6.5" +version = "0.6.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89254598aa9b9fa608de44b3ae54c810f0f06d755e24c50177f1f8f31ff50ce2" +checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47" dependencies = [ "rustc_version", ] @@ -1651,6 +1651,16 @@ dependencies = [ "rusticata-macros", ] +[[package]] +name = "deranged" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b42b6fa04a440b495c8b04d0e71b707c585f83cb9cb28cf8cd0d976c315e31b4" +dependencies = [ + "powerfmt", + "serde", +] + [[package]] name = "desim" version = "0.1.0" @@ -3008,9 +3018,9 @@ checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771" [[package]] name = "measured" -version = "0.0.21" +version = "0.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "652bc741286361c06de8cb4d89b21a6437f120c508c51713663589eeb9928ac5" +checksum = "3051f3a030d55d680cdef6ca50e80abd1182f8da29f2344a7c9cb575721138f0" dependencies = [ "bytes", "crossbeam-utils", @@ -3026,9 +3036,9 @@ dependencies = [ [[package]] name = "measured-derive" -version = "0.0.21" +version = "0.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea497f33e1e856a376c32ad916f69a0bd3c597db1f912a399f842b01a4a685d" +checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -3038,9 +3048,9 @@ dependencies = [ [[package]] name = "measured-process" -version = "0.0.21" +version = "0.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b364ccb66937a814b6b2ad751d1a2f7a9d5a78c761144036825fb36bb0771000" +checksum = "7c4b80445aeb08e832d87bf1830049a924cdc1d6b7ef40b6b9b365bff17bf8ec" dependencies = [ "libc", "measured", @@ -3275,6 +3285,12 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + [[package]] name = "num-integer" version = "0.1.45" @@ -3667,6 +3683,7 @@ dependencies = [ "sysinfo", "tenant_size_model", "thiserror", + "tikv-jemallocator", "tokio", "tokio-epoll-uring", "tokio-io-timeout", @@ -4077,6 +4094,7 @@ dependencies = [ "tokio-postgres", "tokio-postgres-rustls", "tokio-rustls 0.25.0", + "tokio-util", "tracing", "workspace_hack", ] @@ -4117,6 +4135,12 @@ dependencies = [ "workspace_hack", ] +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + [[package]] 
name = "ppv-lite86" version = "0.2.17" @@ -5396,9 +5420,9 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.183" +version = "1.0.203" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32ac8da02677876d532745a130fc9d8e6edfa81a269b107c5b00829b91d8eb3c" +checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" dependencies = [ "serde_derive", ] @@ -5415,9 +5439,9 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.183" +version = "1.0.203" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aafe972d60b0b9bee71a91b92fee2d4fb3c9d7e8f6b179aa99f27203d99a4816" +checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" dependencies = [ "proc-macro2", "quote", @@ -6107,12 +6131,15 @@ dependencies = [ [[package]] name = "time" -version = "0.3.21" +version = "0.3.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f3403384eaacbca9923fa06940178ac13e4edb725486d70e8e15881d0c836cc" +checksum = "5dfd88e563464686c916c7e46e623e520ddc6d79fa6641390f2e3fa86e83e885" dependencies = [ + "deranged", "itoa", "js-sys", + "num-conv", + "powerfmt", "serde", "time-core", "time-macros", @@ -6120,16 +6147,17 @@ dependencies = [ [[package]] name = "time-core" -version = "0.1.1" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7300fbefb4dadc1af235a9cef3737cea692a9d97e1b9cbcd4ebdae6f8868e6fb" +checksum = "ef927ca75afb808a4d64dd374f00a2adf8d0fcff8e7b184af886c3c87ec4a3f3" [[package]] name = "time-macros" -version = "0.2.9" +version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "372950940a5f07bf38dbe211d7283c9e6d7327df53794992d293e534c733d09b" +checksum = "3f252a68540fde3a3877aeea552b832b40ab9a69e318efd078774a01ddee1ccf" dependencies = [ + "num-conv", "time-core", ] @@ -6811,6 +6839,7 @@ dependencies = [ "tokio-stream", "tokio-tar", "tokio-util", + "toml_edit 0.19.10", "tracing", "tracing-error", "tracing-subscriber", @@ -7426,13 +7455,12 @@ dependencies = [ "clap", "clap_builder", "crossbeam-utils", + "deranged", "either", "fail", "futures-channel", - "futures-core", "futures-executor", "futures-io", - "futures-sink", "futures-util", "getrandom 0.2.11", "hashbrown 0.14.5", @@ -7450,7 +7478,9 @@ dependencies = [ "num-traits", "once_cell", "parquet", + "proc-macro2", "prost", + "quote", "rand 0.8.5", "regex", "regex-automata 0.4.3", @@ -7467,6 +7497,7 @@ dependencies = [ "syn 1.0.109", "syn 2.0.52", "sync_wrapper", + "tikv-jemalloc-sys", "time", "time-macros", "tokio", diff --git a/Cargo.toml b/Cargo.toml index 8fddaaef12dd..fc3dd5180922 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -111,8 +111,8 @@ lasso = "0.7" leaky-bucket = "1.0.1" libc = "0.2" md5 = "0.7.0" -measured = { version = "0.0.21", features=["lasso"] } -measured-process = { version = "0.0.21" } +measured = { version = "0.0.22", features=["lasso"] } +measured-process = { version = "0.0.22" } memoffset = "0.8" nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] } notify = "6.0.0" diff --git a/Dockerfile b/Dockerfile index b4900d4a94a1..a41598ef72cd 100644 --- a/Dockerfile +++ b/Dockerfile @@ -42,12 +42,13 @@ ARG CACHEPOT_BUCKET=neon-github-dev COPY --from=pg-build /home/nonroot/pg_install/v14/include/postgresql/server pg_install/v14/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server 
pg_install/v15/include/postgresql/server COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server +COPY --from=pg-build /home/nonroot/pg_install/v16/lib pg_install/v16/lib COPY --chown=nonroot . . # Show build caching stats to check if it was used in the end. # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats. RUN set -e \ - && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \ + && PQ_LIB_DIR=$(pwd)/pg_install/v16/lib RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build \ --bin pg_sni_router \ --bin pageserver \ --bin pagectl \ @@ -56,6 +57,7 @@ RUN set -e \ --bin storage_controller \ --bin proxy \ --bin neon_local \ + --bin storage_scrubber \ --locked --release \ && cachepot -s @@ -82,6 +84,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy /usr/local/bin COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local /usr/local/bin +COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber /usr/local/bin COPY --from=pg-build /home/nonroot/pg_install/v14 /usr/local/v14/ COPY --from=pg-build /home/nonroot/pg_install/v15 /usr/local/v15/ diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index f85706ef6a44..4826b7914e42 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -1,5 +1,13 @@ FROM debian:bullseye-slim +# Use ARG as a build-time environment variable here to allow. +# It's not supposed to be set outside. +# Alternatively it can be obtained using the following command +# ``` +# . 
/etc/os-release && echo "${VERSION_CODENAME}" +# ``` +ARG DEBIAN_VERSION_CODENAME=bullseye + # Add nonroot user RUN useradd -ms /bin/bash nonroot -b /home SHELL ["/bin/bash", "-c"] @@ -26,7 +34,6 @@ RUN set -e \ liblzma-dev \ libncurses5-dev \ libncursesw5-dev \ - libpq-dev \ libreadline-dev \ libseccomp-dev \ libsqlite3-dev \ @@ -67,12 +74,24 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/ # LLVM ENV LLVM_VERSION=18 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ - && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ + && echo "deb http://apt.llvm.org/${DEBIAN_VERSION_CODENAME}/ llvm-toolchain-${DEBIAN_VERSION_CODENAME}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && apt update \ && apt install -y clang-${LLVM_VERSION} llvm-${LLVM_VERSION} \ && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* +# Install docker +RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \ + && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION_CODENAME} stable" > /etc/apt/sources.list.d/docker.list \ + && apt update \ + && apt install -y docker-ce docker-ce-cli \ + && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* + +# Configure sudo & docker +RUN usermod -aG sudo nonroot && \ + echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers && \ + usermod -aG docker nonroot + # AWS CLI RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "awscliv2.zip" \ && unzip -q awscliv2.zip \ diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index a79b666409ae..eced6fc0b2e7 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -798,7 +798,11 @@ impl ComputeNode { // In this case we need to connect with old `zenith_admin` name // and create new user. We cannot simply rename connected user, // but we can create a new one and grant it all privileges. - let connstr = self.connstr.clone(); + let mut connstr = self.connstr.clone(); + connstr + .query_pairs_mut() + .append_pair("application_name", "apply_config"); + let mut client = match Client::connect(connstr.as_str(), NoTls) { Err(e) => match e.code() { Some(&SqlState::INVALID_PASSWORD) @@ -867,15 +871,19 @@ impl ComputeNode { // Run migrations separately to not hold up cold starts thread::spawn(move || { + let mut connstr = connstr.clone(); + connstr + .query_pairs_mut() + .append_pair("application_name", "migrations"); + let mut client = Client::connect(connstr.as_str(), NoTls)?; handle_migrations(&mut client).context("apply_config handle_migrations") }); Ok(()) } - // We could've wrapped this around `pg_ctl reload`, but right now we don't use - // `pg_ctl` for start / stop, so this just seems much easier to do as we already - // have opened connection to Postgres and superuser access. + // Wrapped this around `pg_ctl reload`, but right now we don't use + // `pg_ctl` for start / stop. 
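
The compute.rs hunk above appends an `application_name` ("apply_config", "migrations") to the connection string before connecting, so the compute node's own sessions are distinguishable in `pg_stat_activity`. A minimal sketch of the same pattern, assuming the connection string is a `url::Url` as the `query_pairs_mut()` calls above imply (the helper name `with_application_name` is illustrative, not from this codebase):

```rust
use url::Url;

fn with_application_name(connstr: &Url, app_name: &str) -> Url {
    // Clone so the original connection string stays untouched, then tag the
    // copy; e.g. "apply_config" or "migrations" then shows up as
    // application_name in pg_stat_activity.
    let mut tagged = connstr.clone();
    tagged
        .query_pairs_mut()
        .append_pair("application_name", app_name);
    tagged
}

fn main() -> Result<(), url::ParseError> {
    let base = Url::parse("postgres://cloud_admin@localhost:5432/postgres")?;
    let tagged = with_application_name(&base, "apply_config");
    // Prints: postgres://cloud_admin@localhost:5432/postgres?application_name=apply_config
    println!("{tagged}");
    Ok(())
}
```

`append_pair` percent-encodes the value and leaves any query parameters already present on the connection string in place.
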
#[instrument(skip_all)] fn pg_reload_conf(&self) -> Result<()> { let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl"); @@ -1387,7 +1395,9 @@ pub fn forward_termination_signal() { let pg_pid = PG_PID.load(Ordering::SeqCst); if pg_pid != 0 { let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32); - // use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html - kill(pg_pid, Signal::SIGQUIT).ok(); + // Use 'fast' shutdown (SIGINT) because it also creates a shutdown checkpoint, which is important for + // ROs to get a list of running xacts faster instead of going through the CLOG. + // See https://www.postgresql.org/docs/current/server-shutdown.html for the list of modes and signals. + kill(pg_pid, Signal::SIGINT).ok(); } } diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 18c228ba5427..543d4462ed1c 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -11,6 +11,7 @@ pub mod logger; pub mod catalog; pub mod compute; pub mod extension_server; +mod migration; pub mod monitor; pub mod params; pub mod pg_helpers; diff --git a/compute_tools/src/migration.rs b/compute_tools/src/migration.rs new file mode 100644 index 000000000000..61dcf01c8448 --- /dev/null +++ b/compute_tools/src/migration.rs @@ -0,0 +1,100 @@ +use anyhow::{Context, Result}; +use postgres::Client; +use tracing::info; + +pub(crate) struct MigrationRunner<'m> { + client: &'m mut Client, + migrations: &'m [&'m str], +} + +impl<'m> MigrationRunner<'m> { + pub fn new(client: &'m mut Client, migrations: &'m [&'m str]) -> Self { + Self { client, migrations } + } + + fn get_migration_id(&mut self) -> Result { + let query = "SELECT id FROM neon_migration.migration_id"; + let row = self + .client + .query_one(query, &[]) + .context("run_migrations get migration_id")?; + + Ok(row.get::<&str, i64>("id")) + } + + fn update_migration_id(&mut self) -> Result<()> { + let setval = format!( + "UPDATE neon_migration.migration_id SET id={}", + self.migrations.len() + ); + + self.client + .simple_query(&setval) + .context("run_migrations update id")?; + + Ok(()) + } + + fn prepare_migrations(&mut self) -> Result<()> { + let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; + self.client.simple_query(query)?; + + let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; + self.client.simple_query(query)?; + + let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; + self.client.simple_query(query)?; + + let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; + self.client.simple_query(query)?; + + let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; + self.client.simple_query(query)?; + + Ok(()) + } + + pub fn run_migrations(mut self) -> Result<()> { + self.prepare_migrations()?; + + let mut current_migration: usize = self.get_migration_id()? 
as usize; + let starting_migration_id = current_migration; + + let query = "BEGIN"; + self.client + .simple_query(query) + .context("run_migrations begin")?; + + while current_migration < self.migrations.len() { + let migration = self.migrations[current_migration]; + + if migration.starts_with("-- SKIP") { + info!("Skipping migration id={}", current_migration); + } else { + info!( + "Running migration id={}:\n{}\n", + current_migration, migration + ); + self.client.simple_query(migration).with_context(|| { + format!("run_migration current_migration={}", current_migration) + })?; + } + + current_migration += 1; + } + + self.update_migration_id()?; + + let query = "COMMIT"; + self.client + .simple_query(query) + .context("run_migrations commit")?; + + info!( + "Ran {} migrations", + (self.migrations.len() - starting_migration_id) + ); + + Ok(()) + } +} diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs index 872a3f775070..d7127aac32d4 100644 --- a/compute_tools/src/monitor.rs +++ b/compute_tools/src/monitor.rs @@ -17,7 +17,11 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500); // should be handled gracefully. fn watch_compute_activity(compute: &ComputeNode) { // Suppose that `connstr` doesn't change - let connstr = compute.connstr.as_str(); + let mut connstr = compute.connstr.clone(); + connstr + .query_pairs_mut() + .append_pair("application_name", "compute_activity_monitor"); + let connstr = connstr.as_str(); // During startup and configuration we connect to every Postgres database, // but we don't want to count this as some user activity. So wait until diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs index fa0822748b61..863fa9468ff4 100644 --- a/compute_tools/src/pg_helpers.rs +++ b/compute_tools/src/pg_helpers.rs @@ -489,7 +489,7 @@ pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()> /// Read Postgres logs from `stderr` until EOF. 
Buffer is flushed on one of the following conditions: /// - next line starts with timestamp /// - EOF -/// - no new lines were written for the last second +/// - no new lines were written for the last 100 milliseconds async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> { let mut lines = tokio::io::BufReader::new(stderr).lines(); let timeout_duration = Duration::from_millis(100); diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 143f6c1e5f6f..37090b08fd37 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -10,6 +10,7 @@ use tracing::{error, info, info_span, instrument, span_enabled, warn, Level}; use crate::config; use crate::logger::inlinify; +use crate::migration::MigrationRunner; use crate::params::PG_HBA_ALL_MD5; use crate::pg_helpers::*; @@ -791,69 +792,7 @@ pub fn handle_migrations(client: &mut Client) -> Result<()> { include_str!("./migrations/0008-revoke_replication_for_previously_allowed_roles.sql"), ]; - let mut func = || { - let query = "CREATE SCHEMA IF NOT EXISTS neon_migration"; - client.simple_query(query)?; - - let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)"; - client.simple_query(query)?; - - let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING"; - client.simple_query(query)?; - - let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin"; - client.simple_query(query)?; - - let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC"; - client.simple_query(query)?; - Ok::<_, anyhow::Error>(()) - }; - func().context("handle_migrations prepare")?; - - let query = "SELECT id FROM neon_migration.migration_id"; - let row = client - .query_one(query, &[]) - .context("handle_migrations get migration_id")?; - let mut current_migration: usize = row.get::<&str, i64>("id") as usize; - let starting_migration_id = current_migration; - - let query = "BEGIN"; - client - .simple_query(query) - .context("handle_migrations begin")?; - - while current_migration < migrations.len() { - let migration = &migrations[current_migration]; - if migration.starts_with("-- SKIP") { - info!("Skipping migration id={}", current_migration); - } else { - info!( - "Running migration id={}:\n{}\n", - current_migration, migration - ); - client.simple_query(migration).with_context(|| { - format!("handle_migrations current_migration={}", current_migration) - })?; - } - current_migration += 1; - } - let setval = format!( - "UPDATE neon_migration.migration_id SET id={}", - migrations.len() - ); - client - .simple_query(&setval) - .context("handle_migrations update id")?; - - let query = "COMMIT"; - client - .simple_query(query) - .context("handle_migrations commit")?; - - info!( - "Ran {} migrations", - (migrations.len() - starting_migration_id) - ); + MigrationRunner::new(client, &migrations).run_migrations()?; Ok(()) } diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index f381337346ff..4bf1b29785e8 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -21,10 +21,8 @@ use pageserver_api::config::{ DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT, DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT, }; -use pageserver_api::controller_api::PlacementPolicy; -use pageserver_api::models::{ - ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo, -}; +use pageserver_api::controller_api::{PlacementPolicy, 
TenantCreateRequest}; +use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInfo}; use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId}; use postgres_backend::AuthType; use postgres_connection::parse_host_port; @@ -848,20 +846,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re let allow_multiple = sub_args.get_flag("allow-multiple"); - // If --safekeepers argument is given, use only the listed safekeeper nodes. - let safekeepers = - if let Some(safekeepers_str) = sub_args.get_one::("safekeepers") { - let mut safekeepers: Vec = Vec::new(); - for sk_id in safekeepers_str.split(',').map(str::trim) { - let sk_id = NodeId(u64::from_str(sk_id).map_err(|_| { - anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list") - })?); - safekeepers.push(sk_id); - } - safekeepers - } else { - env.safekeepers.iter().map(|sk| sk.id).collect() - }; + // If --safekeepers argument is given, use only the listed + // safekeeper nodes; otherwise all from the env. + let safekeepers = if let Some(safekeepers) = parse_safekeepers(sub_args)? { + safekeepers + } else { + env.safekeepers.iter().map(|sk| sk.id).collect() + }; let endpoint = cplane .endpoints @@ -965,7 +956,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re }) .collect::>() }; - endpoint.reconfigure(pageservers, None).await?; + // If --safekeepers argument is given, use only the listed + // safekeeper nodes; otherwise all from the env. + let safekeepers = parse_safekeepers(sub_args)?; + endpoint.reconfigure(pageservers, None, safekeepers).await?; } "stop" => { let endpoint_id = sub_args @@ -987,6 +981,23 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re Ok(()) } +/// Parse --safekeepers as list of safekeeper ids. +fn parse_safekeepers(sub_args: &ArgMatches) -> Result>> { + if let Some(safekeepers_str) = sub_args.get_one::("safekeepers") { + let mut safekeepers: Vec = Vec::new(); + for sk_id in safekeepers_str.split(',').map(str::trim) { + let sk_id = NodeId( + u64::from_str(sk_id) + .map_err(|_| anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list"))?, + ); + safekeepers.push(sk_id); + } + Ok(Some(safekeepers)) + } else { + Ok(None) + } +} + fn handle_mappings(sub_match: &ArgMatches, env: &mut local_env::LocalEnv) -> Result<()> { let (sub_name, sub_args) = match sub_match.subcommand() { Some(ep_subcommand_data) => ep_subcommand_data, @@ -1590,7 +1601,7 @@ fn cli() -> Command { .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.") .arg(endpoint_id_arg.clone()) .arg(endpoint_pageserver_id_arg.clone()) - .arg(safekeepers_arg) + .arg(safekeepers_arg.clone()) .arg(remote_ext_config_args) .arg(create_test_user) .arg(allow_multiple.clone()) @@ -1599,6 +1610,7 @@ fn cli() -> Command { .subcommand(Command::new("reconfigure") .about("Reconfigure the endpoint") .arg(endpoint_pageserver_id_arg) + .arg(safekeepers_arg) .arg(endpoint_id_arg.clone()) .arg(tenant_id_arg.clone()) ) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index b928bbfc308e..f9bb2da7e7ac 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -499,6 +499,23 @@ impl Endpoint { .join(",") } + /// Map safekeepers ids to the actual connection strings. 
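
Around this point the CLI side (`parse_safekeepers`, above) turns a `--safekeepers` list into node IDs, and the endpoint side (`build_safekeepers_connstrs`, below) resolves those IDs to listen addresses. A self-contained sketch of that parse-then-resolve flow using plain integers; the function names and the `ports` map are illustrative stand-ins, not the repository's actual types:

```rust
use std::collections::HashMap;

/// Parse a `--safekeepers` style list ("1, 2,3") into node ids,
/// mirroring the validation done by `parse_safekeepers` above.
fn parse_safekeeper_ids(arg: &str) -> Result<Vec<u64>, String> {
    arg.split(',')
        .map(str::trim)
        .filter(|s| !s.is_empty())
        .map(|s| {
            s.parse::<u64>()
                .map_err(|_| format!("invalid node ID \"{s}\" in --safekeepers list"))
        })
        .collect()
}

/// Map node ids to "host:port" strings, analogous to
/// `build_safekeepers_connstrs` below; `ports` stands in for the
/// environment's safekeeper registry.
fn safekeeper_connstrings(ids: &[u64], ports: &HashMap<u64, u16>) -> Result<Vec<String>, String> {
    ids.iter()
        .map(|id| {
            ports
                .get(id)
                .map(|port| format!("127.0.0.1:{port}"))
                .ok_or_else(|| format!("safekeeper {id} does not exist"))
        })
        .collect()
}

fn main() -> Result<(), String> {
    let ports = HashMap::from([(1, 5454), (2, 5455)]);
    let ids = parse_safekeeper_ids("1, 2")?;
    let connstrs = safekeeper_connstrings(&ids, &ports)?;
    assert_eq!(connstrs, vec!["127.0.0.1:5454", "127.0.0.1:5455"]);
    Ok(())
}
```

Collecting into `Result` surfaces the first invalid ID or unknown safekeeper as the error, matching the bail-out behaviour of the real helpers.
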
+ fn build_safekeepers_connstrs(&self, sk_ids: Vec) -> Result> { + let mut safekeeper_connstrings = Vec::new(); + if self.mode == ComputeMode::Primary { + for sk_id in sk_ids { + let sk = self + .env + .safekeepers + .iter() + .find(|node| node.id == sk_id) + .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?; + safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port())); + } + } + Ok(safekeeper_connstrings) + } + pub async fn start( &self, auth_token: &Option, @@ -523,18 +540,7 @@ impl Endpoint { let pageserver_connstring = Self::build_pageserver_connstr(&pageservers); assert!(!pageserver_connstring.is_empty()); - let mut safekeeper_connstrings = Vec::new(); - if self.mode == ComputeMode::Primary { - for sk_id in safekeepers { - let sk = self - .env - .safekeepers - .iter() - .find(|node| node.id == sk_id) - .ok_or_else(|| anyhow!("safekeeper {sk_id} does not exist"))?; - safekeeper_connstrings.push(format!("127.0.0.1:{}", sk.get_compute_port())); - } - } + let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?; // check for file remote_extensions_spec.json // if it is present, read it and pass to compute_ctl @@ -740,6 +746,7 @@ impl Endpoint { &self, mut pageservers: Vec<(Host, u16)>, stripe_size: Option, + safekeepers: Option>, ) -> Result<()> { let mut spec: ComputeSpec = { let spec_path = self.endpoint_path().join("spec.json"); @@ -774,6 +781,12 @@ impl Endpoint { spec.shard_stripe_size = stripe_size.map(|s| s.0 as usize); } + // If safekeepers are not specified, don't change them. + if let Some(safekeepers) = safekeepers { + let safekeeper_connstrings = self.build_safekeepers_connstrs(safekeepers)?; + spec.safekeeper_connstrings = safekeeper_connstrings; + } + let client = reqwest::Client::builder() .timeout(Duration::from_secs(30)) .build() diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 6634274d2a55..3ac3ce21df8f 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -325,11 +325,16 @@ impl LocalEnv { } } + pub fn pg_dir(&self, pg_version: u32, dir_name: &str) -> anyhow::Result { + Ok(self.pg_distrib_dir(pg_version)?.join(dir_name)) + } + pub fn pg_bin_dir(&self, pg_version: u32) -> anyhow::Result { - Ok(self.pg_distrib_dir(pg_version)?.join("bin")) + self.pg_dir(pg_version, "bin") } + pub fn pg_lib_dir(&self, pg_version: u32) -> anyhow::Result { - Ok(self.pg_distrib_dir(pg_version)?.join("lib")) + self.pg_dir(pg_version, "lib") } pub fn pageserver_bin(&self) -> PathBuf { diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index da4b98784915..f0403b179622 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -15,10 +15,8 @@ use std::time::Duration; use anyhow::{bail, Context}; use camino::Utf8PathBuf; -use futures::SinkExt; use pageserver_api::models::{ - self, AuxFilePolicy, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, - TimelineInfo, + self, AuxFilePolicy, LocationConfig, TenantHistorySize, TenantInfo, TimelineInfo, }; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; @@ -397,28 +395,6 @@ impl PageServerNode { } } - pub async fn tenant_create( - &self, - new_tenant_id: TenantId, - generation: Option, - settings: HashMap<&str, &str>, - ) -> anyhow::Result { - let config = Self::parse_config(settings.clone())?; - - let request = models::TenantCreateRequest { - new_tenant_id: TenantShardId::unsharded(new_tenant_id), - generation, - config, - shard_parameters: 
ShardParameters::default(), - // Placement policy is not meaningful for creations not done via storage controller - placement_policy: None, - }; - if !settings.is_empty() { - bail!("Unrecognized tenant settings: {settings:?}") - } - Ok(self.http_client.tenant_create(&request).await?) - } - pub async fn tenant_config( &self, tenant_id: TenantId, @@ -589,60 +565,39 @@ impl PageServerNode { pg_wal: Option<(Lsn, PathBuf)>, pg_version: u32, ) -> anyhow::Result<()> { - let (client, conn) = self.page_server_psql_client().await?; - // The connection object performs the actual communication with the database, - // so spawn it off to run on its own. - tokio::spawn(async move { - if let Err(e) = conn.await { - eprintln!("connection error: {}", e); - } - }); - let client = std::pin::pin!(client); - // Init base reader let (start_lsn, base_tarfile_path) = base; let base_tarfile = tokio::fs::File::open(base_tarfile_path).await?; - let base_tarfile = tokio_util::io::ReaderStream::new(base_tarfile); + let base_tarfile = + mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(base_tarfile)); // Init wal reader if necessary let (end_lsn, wal_reader) = if let Some((end_lsn, wal_tarfile_path)) = pg_wal { let wal_tarfile = tokio::fs::File::open(wal_tarfile_path).await?; - let wal_reader = tokio_util::io::ReaderStream::new(wal_tarfile); + let wal_reader = + mgmt_api::ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(wal_tarfile)); (end_lsn, Some(wal_reader)) } else { (start_lsn, None) }; - let copy_in = |reader, cmd| { - let client = &client; - async move { - let writer = client.copy_in(&cmd).await?; - let writer = std::pin::pin!(writer); - let mut writer = writer.sink_map_err(|e| { - std::io::Error::new(std::io::ErrorKind::Other, format!("{e}")) - }); - let mut reader = std::pin::pin!(reader); - writer.send_all(&mut reader).await?; - writer.into_inner().finish().await?; - anyhow::Ok(()) - } - }; - // Import base - copy_in( - base_tarfile, - format!( - "import basebackup {tenant_id} {timeline_id} {start_lsn} {end_lsn} {pg_version}" - ), - ) - .await?; - // Import wal if necessary - if let Some(wal_reader) = wal_reader { - copy_in( - wal_reader, - format!("import wal {tenant_id} {timeline_id} {start_lsn} {end_lsn}"), + self.http_client + .import_basebackup( + tenant_id, + timeline_id, + start_lsn, + end_lsn, + pg_version, + base_tarfile, ) .await?; + + // Import wal if necessary + if let Some(wal_reader) = wal_reader { + self.http_client + .import_wal(tenant_id, timeline_id, start_lsn, end_lsn, wal_reader) + .await?; } Ok(()) diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 1c56d5f80fe4..47103a2e0ac5 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -5,12 +5,11 @@ use crate::{ use camino::{Utf8Path, Utf8PathBuf}; use pageserver_api::{ controller_api::{ - NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse, - TenantShardMigrateRequest, TenantShardMigrateResponse, + NodeConfigureRequest, NodeRegisterRequest, TenantCreateRequest, TenantCreateResponse, + TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse, }, models::{ - TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse, - TimelineCreateRequest, TimelineInfo, + TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo, }, shard::{ShardStripeSize, TenantShardId}, }; @@ -156,16 +155,16 @@ impl StorageController { .expect("non-Unicode 
path") } - /// Find the directory containing postgres binaries, such as `initdb` and `pg_ctl` + /// Find the directory containing postgres subdirectories, such `bin` and `lib` /// /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back /// to other versions if that one isn't found. Some automated tests create circumstances /// where only one version is available in pg_distrib_dir, such as `test_remote_extensions`. - pub async fn get_pg_bin_dir(&self) -> anyhow::Result { + async fn get_pg_dir(&self, dir_name: &str) -> anyhow::Result { let prefer_versions = [STORAGE_CONTROLLER_POSTGRES_VERSION, 15, 14]; for v in prefer_versions { - let path = Utf8PathBuf::from_path_buf(self.env.pg_bin_dir(v)?).unwrap(); + let path = Utf8PathBuf::from_path_buf(self.env.pg_dir(v, dir_name)?).unwrap(); if tokio::fs::try_exists(&path).await? { return Ok(path); } @@ -173,11 +172,20 @@ impl StorageController { // Fall through anyhow::bail!( - "Postgres binaries not found in {}", - self.env.pg_distrib_dir.display() + "Postgres directory '{}' not found in {}", + dir_name, + self.env.pg_distrib_dir.display(), ); } + pub async fn get_pg_bin_dir(&self) -> anyhow::Result { + self.get_pg_dir("bin").await + } + + pub async fn get_pg_lib_dir(&self) -> anyhow::Result { + self.get_pg_dir("lib").await + } + /// Readiness check for our postgres process async fn pg_isready(&self, pg_bin_dir: &Utf8Path) -> anyhow::Result { let bin_path = pg_bin_dir.join("pg_isready"); @@ -230,12 +238,17 @@ impl StorageController { .unwrap() .join("storage_controller_db"); let pg_bin_dir = self.get_pg_bin_dir().await?; + let pg_lib_dir = self.get_pg_lib_dir().await?; let pg_log_path = pg_data_path.join("postgres.log"); if !tokio::fs::try_exists(&pg_data_path).await? { // Initialize empty database let initdb_path = pg_bin_dir.join("initdb"); let mut child = Command::new(&initdb_path) + .envs(vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ]) .args(["-D", pg_data_path.as_ref()]) .spawn() .expect("Failed to spawn initdb"); @@ -270,7 +283,10 @@ impl StorageController { &self.env.base_data_dir, pg_bin_dir.join("pg_ctl").as_std_path(), db_start_args, - [], + vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ], background_process::InitialPidFile::Create(self.postgres_pid_file()), retry_timeout, || self.pg_isready(&pg_bin_dir), @@ -325,7 +341,10 @@ impl StorageController { &self.env.base_data_dir, &self.env.storage_controller_bin(), args, - [], + vec![ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ], background_process::InitialPidFile::Create(self.pid_file()), retry_timeout, || async { diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 775aedb60001..b2c5dfe58a7f 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -4,13 +4,13 @@ use std::{str::FromStr, time::Duration}; use clap::{Parser, Subcommand}; use pageserver_api::{ controller_api::{ - NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, + NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, - ShardParameters, TenantConfig, TenantConfigRequest, TenantCreateRequest, 
- TenantShardSplitRequest, TenantShardSplitResponse, + ShardParameters, TenantConfig, TenantConfigRequest, TenantShardSplitRequest, + TenantShardSplitResponse, }, shard::{ShardStripeSize, TenantShardId}, }; @@ -336,14 +336,18 @@ async fn main() -> anyhow::Result<()> { .await?; } Command::TenantCreate { tenant_id } => { - vps_client - .tenant_create(&TenantCreateRequest { - new_tenant_id: TenantShardId::unsharded(tenant_id), - generation: None, - shard_parameters: ShardParameters::default(), - placement_policy: Some(PlacementPolicy::Attached(1)), - config: TenantConfig::default(), - }) + storcon_client + .dispatch( + Method::POST, + "v1/tenant".to_string(), + Some(TenantCreateRequest { + new_tenant_id: TenantShardId::unsharded(tenant_id), + generation: None, + shard_parameters: ShardParameters::default(), + placement_policy: Some(PlacementPolicy::Attached(1)), + config: TenantConfig::default(), + }), + ) .await?; } Command::TenantDelete { tenant_id } => { diff --git a/docs/rfcs/033-storage-controller-drain-and-fill.md b/docs/rfcs/033-storage-controller-drain-and-fill.md new file mode 100644 index 000000000000..77c84cd2a525 --- /dev/null +++ b/docs/rfcs/033-storage-controller-drain-and-fill.md @@ -0,0 +1,345 @@ +# Graceful Restarts of Storage Controller Managed Clusters + +## Summary +This RFC describes new storage controller APIs for draining and filling tenant shards from/on pageserver nodes. +It also covers how these new APIs should be used by an orchestrator (e.g. Ansible) in order to implement +graceful cluster restarts. + +## Motivation + +Pageserver restarts cause read availability downtime for tenants. + +For example pageserver-3 @ us-east-1 was unavailable for a randomly +picked tenant (which requested on-demand activation) for around 30 seconds +during the restart at 2024-04-03 16:37 UTC. + +Note that lots of shutdowns on loaded pageservers do not finish within the +[10 second systemd enforced timeout](https://github.com/neondatabase/aws/blob/0a5280b383e43c063d43cbf87fa026543f6d6ad4/.github/ansible/systemd/pageserver.service#L16). This means we are shutting down without flushing ephemeral layers +and have to reingest data in order to serve requests after restarting, potentially making first request latencies worse. + +This problem is not yet very acutely felt in storage controller managed pageservers since +tenant density is much lower there. However, we are planning on eventually migrating all +pageservers to storage controller management, so it makes sense to solve the issue proactively. + +## Requirements + +- Pageserver re-deployments cause minimal downtime for tenants +- The storage controller exposes HTTP API hooks for draining and filling tenant shards +from a given pageserver. Said hooks can be used by an orchestrator process or a human operator. +- The storage controller exposes some HTTP API to cancel draining and filling background operations. +- Failures to drain or fill the node should not be fatal. In such cases, cluster restarts should proceed +as usual (with downtime). +- Progress of draining/filling is visible through metrics + +## Non Goals + +- Integration with the control plane +- Graceful restarts for large non-HA tenants. + +## Impacted Components + +- storage controller +- deployment orchestrator (i.e. Ansible) +- pageserver (indirectly) + +## Terminology + +** Draining ** is the process through which all tenant shards that can be migrated from a given pageserver +are distributed across the rest of the cluster. 
+ +** Filling ** is the symmetric opposite of draining. In this process, tenant shards are migrated onto a given +pageserver until the cluster reaches a reasonable, quiescent distribution of tenant shards across pageservers. + +** Node scheduling policies ** act as constraints to the scheduler. For instance, when a +node is set in the `Paused` policy, no further shards will be scheduled on it. + +** Node ** is a pageserver. The terms are used interchangeably in this RFC. + +** Deployment orchestrator ** is a generic term for whatever drives our deployments. +Currently, it's an Ansible playbook. + +## Background + +### Storage Controller Basics (skip if already familiar) + +Fundamentally, the storage controller is a reconciler which aims to move from the observed mapping between pageservers and tenant shards to an intended mapping. Pageserver node and tenant shard metadata is durably persisted in a database, but note that the mapping between the two entities is not durably persisted. Instead, this mapping (*observed state*) is constructed at startup by sending `GET location_config` requests to registered pageservers. + +An internal scheduler maps tenant shards to pageservers while respecting certain constraints. The result of scheduling is the *intent state*. When the intent state changes, a *reconciliation* will inform pageservers about the new assignment via `PUT location_config` requests and will notify the compute via the configured hook. + +### Background Optimizations + +The storage controller performs scheduling optimizations in the background. It will +migrate attachments to warm secondaries and replace secondaries in order to balance +the cluster out. + +### Reconciliations Concurrency Limiting + +There's a hard limit on the number of reconciles that the storage controller +can have in flight at any given time. To get an idea of scale, the limit is +128 at the time of writing. + +## Implementation + +Note: this section focuses on the core functionality of the graceful restart process. +It doesn't necessarily describe the most efficient approach. Optimizations are described +separately in a later section. + +### Overall Flow + +This section describes how to implement graceful restarts from the perspective +of Ansible, the deployment orchestrator. Pageservers are already restarted sequentially. +The orchestrator shall implement the following prologue and epilogue steps for each +pageserver restart: + +#### Prologue + +The orchestrator shall first fetch the pageserver node id from the control plane or +the pageserver it aims to restart directly. Next, it issues an HTTP request +to the storage controller in order to start the drain of said pageserver node. +All error responses are retried with a short back-off. When a 202 (Accepted) +HTTP code is returned, the drain has started. Now the orchestrator polls the +node status endpoint exposed by the storage controller in order to await the +end of the drain process. When the `policy` field of the node status response +becomes `PauseForRestart`, the drain has completed and the orchestrator can +proceed with restarting the pageserver. + +The prologue is subject to an overall timeout. It will have a value in the ballpark +of minutes. As storage controller managed pageservers become more loaded, this timeout +will likely have to increase. + +#### Epilogue + +After restarting the pageserver, the orchestrator issues an HTTP request +to the storage controller to kick off the filling process. 
This API call +may be retried for all error codes with a short backoff. This also serves +as a synchronization primitive as the fill will be refused if the pageserver +has not yet re-attached to the storage controller. When a 202(Accepted) HTTP +code is returned, the fill has started. Now the orchestrator polls the node +status endpoint exposed by the storage controller in order to await the end of +the filling process. When the `policy` field of the node status response becomes +`Active`, the fill has completed and the orchestrator may proceed to the next pageserver. + +Again, the epilogue is subject to an overall timeout. We can start off with +using the same timeout as for the prologue, but can also consider relying on +the storage controller's background optimizations with a shorter timeout. + +In the case that the deployment orchestrator times out, it attempts to cancel +the fill. This operation shall be retried with a short back-off. If it ultimately +fails it will require manual intervention to set the nodes scheduling policy to +`NodeSchedulingPolicy::Active`. Not doing that is not immediately problematic, +but it constrains the scheduler as mentioned previously. + +### Node Scheduling Policy State Machine + +The state machine below encodes the behaviours discussed above and +the various failover situations described in a later section. + +Assuming no failures and/or timeouts the flow should be: +`Active -> Draining -> PauseForRestart -> Active -> Filling -> Active` + +``` + Operator requested drain + +-----------------------------------------+ + | | + +-------+-------+ +-------v-------+ + | | | | + | Pause | +-----------> Draining +----------+ + | | | | | | + +---------------+ | +-------+-------+ | + | | | + | | | + Drain requested| | | + | |Drain complete | Drain failed + | | | Cancelled/PS reattach/Storcon restart + | | | + +-------+-------+ | | + | | | | + +-------------+ Active <-----------+------------------+ + | | | | +Fill requested | +---^---^-------+ | + | | | | + | | | | + | | | | + | Fill completed| | | + | | |PS reattach | + | | |after restart | + +-------v-------+ | | +-------v-------+ + | | | | | | + | Filling +---------+ +-----------+PauseForRestart| + | | | | + +---------------+ +---------------+ +``` + +### Draining/Filling APIs + +The storage controller API to trigger the draining of a given node is: +`PUT /v1/control/node/:node_id/{drain,fill}`. + +The following HTTP non-success return codes are used. +All of them are safely retriable from the perspective of the storage controller. +- 404: Requested node was not found +- 503: Requested node is known to the storage controller, but unavailable +- 412: Drain precondition failed: there is no other node to drain to or the node's schedulling policy forbids draining +- 409: A {drain, fill} is already in progress. Only one such background operation +is allowed per node. + +When the drain is accepted and commenced a 202 HTTP code is returned. + +Drains and fills shall be cancellable by the deployment orchestrator or a +human operator via: `DELETE /v1/control/node/:node_id/{drain,fill}`. A 200 +response is returned when the cancelation is successful. Errors are retriable. 
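To make the orchestrator flow concrete, here is a minimal sketch of the prologue against these endpoints, assuming a Rust orchestrator built on `reqwest` and `tokio`. The drain path, the 202 response, and the `PauseForRestart` policy value come from this RFC; the node status path and the JSON field layout of its response are assumptions made purely for illustration.

```rust
// Sketch of the orchestrator-side prologue. The drain endpoint, the 202
// response, and the `PauseForRestart` policy value come from this RFC;
// the node status path and the JSON field layout are assumptions.
use std::time::Duration;

use anyhow::{bail, Result};
use reqwest::StatusCode;
use serde::Deserialize;

#[derive(Deserialize)]
struct NodeStatus {
    policy: String, // e.g. "Draining", "PauseForRestart", "Active"
}

async fn drain_and_wait(base_url: &str, node_id: u64, timeout: Duration) -> Result<()> {
    let client = reqwest::Client::new();
    let deadline = tokio::time::Instant::now() + timeout;

    // Start the drain; error responses (404/409/412/503) are retried with a
    // short back-off, as described above.
    loop {
        if tokio::time::Instant::now() > deadline {
            bail!("drain did not start in time; restart with downtime");
        }
        let resp = client
            .put(format!("{base_url}/v1/control/node/{node_id}/drain"))
            .send()
            .await?;
        if resp.status() == StatusCode::ACCEPTED {
            break; // 202: drain has started
        }
        tokio::time::sleep(Duration::from_secs(2)).await;
    }

    // Poll the node status until its scheduling policy becomes PauseForRestart.
    loop {
        if tokio::time::Instant::now() > deadline {
            bail!("drain timed out; restart with downtime");
        }
        let status: NodeStatus = client
            .get(format!("{base_url}/v1/control/node/{node_id}")) // assumed status path
            .send()
            .await?
            .json()
            .await?;
        if status.policy == "PauseForRestart" {
            return Ok(()); // safe to restart this pageserver
        }
        tokio::time::sleep(Duration::from_secs(5)).await;
    }
}
```

The epilogue is symmetric: `PUT .../fill` after the pageserver is back, then poll until `policy` returns to `Active`, cancelling via the `DELETE` endpoint if the overall timeout fires.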
+ +### Drain Process + +Before accepting a drain request, the following validations are applied: +* Ensure that the node is known to the storage controller +* Ensure that the scheduling policy is `NodeSchedulingPolicy::Active` or `NodeSchedulingPolicy::Pause` +* Ensure that another drain or fill is not already running on the node +* Ensure that a drain is possible (i.e. check that there is at least one +schedulable node to drain to) + +After accepting the drain, the scheduling policy of the node is set to +`NodeSchedulingPolicy::Draining` and persisted in both memory and the database. +This disallows the optimizer from adding or removing shards from the node, which +is desirable to avoid them racing. + +Next, a separate Tokio task is spawned to manage the draining. For each tenant +shard attached to the node being drained, demote the node to a secondary and +attempt to schedule the node away. Scheduling might fail due to unsatisfiable +constraints, but that is fine. Draining is a best-effort process since it might +not always be possible to cut over all shards. + +Importantly, this task manages the concurrency of issued reconciles in order to +avoid drowning out the target pageservers and to allow other important reconciles +to proceed (see the sketch just before the failure-mode discussion below). + +Once the triggered reconciles have finished or timed out, set the node's scheduling +policy to `NodeSchedulingPolicy::PauseForRestart` to signal the end of the drain. + +A note on non-HA tenants: These tenants do not have secondaries, so by the description +above, they would not be migrated. It makes sense to skip them (especially the large ones) +since, depending on tenant size, this might be more disruptive than the restart since the +pageserver we've moved to will need to on-demand download the entire working set for the tenant. +We can consider expanding to small non-HA tenants in the future. + +### Fill Process + +Before accepting a fill request, the following validations are applied: +* Ensure that the node is known to the storage controller +* Ensure that the scheduling policy is `NodeSchedulingPolicy::Active`. +This is the only acceptable policy for the fill starting state. When a node re-attaches, +it sets the scheduling policy to `NodeSchedulingPolicy::Active` if it was equal to +`NodeSchedulingPolicy::PauseForRestart` or `NodeSchedulingPolicy::Draining` (possible end states for a node drain). +* Ensure that another drain or fill is not already running on the node + +After accepting the fill, the scheduling policy of the node is set to +`NodeSchedulingPolicy::Filling` and persisted in both memory and the database. +This disallows the optimizer from adding or removing shards from the node, which +is desirable to avoid them racing. + +Next, a separate Tokio task is spawned to manage the filling. For each tenant +shard where the filled node is a secondary, promote the secondary. This is done +until we run out of shards or the counts of attached shards become balanced across +the cluster. + +Like for draining, the concurrency of spawned reconciles is limited. + +### Failure Modes & Handling + +Failures are generally handled by transitioning back into the `Active` +(neutral) state. This simplifies the implementation greatly at the +cost of adding transitions to the state machine. For example, we +could detect the `Draining` state upon restart and proceed with a drain, +but how would the storage controller know that this is still what the orchestrator +needs? 
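Before moving on to the individual failure modes, here is a self-contained sketch (in Rust, using `tokio`) of the bounded-concurrency reconcile loop that both the drain and fill tasks described above rely on. All names and types are invented for illustration; the real storage controller wiring is stubbed out, and only the 128 in-flight limit is taken from this RFC.

```rust
// Stand-in for the drain/fill task's bounded-concurrency loop. Storage
// controller internals (scheduler, reconciler, persistence) are stubbed;
// only the shape of the control flow follows the RFC.
use std::sync::Arc;

use tokio::sync::Semaphore;
use tokio::task::JoinSet;

const MAX_INFLIGHT_RECONCILES: usize = 128; // hard limit quoted earlier in this RFC

async fn reconcile_shard(shard: u32) {
    // Stub: demote the drained node to a secondary for this shard, schedule the
    // attachment elsewhere, and wait for the reconciliation to finish.
    println!("reconciled shard {shard}");
}

async fn drain_node(attached_shards: Vec<u32>) {
    // Persisting NodeSchedulingPolicy::Draining would happen here (stubbed out).
    let limit = Arc::new(Semaphore::new(MAX_INFLIGHT_RECONCILES));
    let mut reconciles = JoinSet::new();

    for shard in attached_shards {
        // One permit per in-flight reconcile keeps the drain from drowning out
        // other reconciliations or the target pageservers.
        let permit = limit.clone().acquire_owned().await.unwrap();
        reconciles.spawn(async move {
            reconcile_shard(shard).await;
            drop(permit);
        });
    }
    while reconciles.join_next().await.is_some() {}

    // Persisting NodeSchedulingPolicy::PauseForRestart would happen here (stubbed out).
}

#[tokio::main]
async fn main() {
    drain_node((0..512).collect()).await;
}
```

The fill task would use the same permit-per-reconcile pattern; only the shard selection and the final policy transition differ.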
+ +#### Storage Controller Crash + +When the storage controller starts up, reset the node scheduling policy +of all nodes in states `Draining`, `Filling` or `PauseForRestart` to +`Active`. The rationale is that when the storage controller restarts, +we have lost context of what the deployment orchestrator wants. It also +has the benefit of making things easier to reason about. + +#### Pageserver Crash During Drain + +The pageserver will attempt to re-attach during restart, at which +point the node scheduling policy will be set back to `Active`, thus +reenabling the scheduler to use the node. + +#### Non-drained Pageserver Crash During Drain + +What should happen when a pageserver we are draining to crashes during the +process? Two reasonable options are: cancel the drain and focus on the failover, +*or* do both, but prioritise failover. Since the number of concurrent reconciles +produced by drains/fills is limited, we get the latter behaviour for free. +My suggestion is we take this approach, but the cancellation option is trivial +to implement as well. + +#### Pageserver Crash During Fill + +The pageserver will attempt to re-attach during restart, at which +point the node scheduling policy will be set back to `Active`, thus +reenabling the scheduler to use the node. + +#### Pageserver Goes Unavailable During Drain/Fill + +The drain and fill jobs handle this by stopping early. When the pageserver +is detected as online by storage controller heartbeats, reset its scheduling +policy to `Active`. If a restart happens instead, see the pageserver crash +failure mode. + +#### Orchestrator Drain Times Out + +The orchestrator will still proceed with the restart. +When the pageserver re-attaches, the scheduling policy is set back to +`Active`. + +#### Orchestrator Fill Times Out + +The orchestrator will attempt to cancel the fill operation. If that fails, +the fill will continue until it quiesces and the node will be left +in the `Filling` scheduling policy. This hinders the scheduler, but is +otherwise harmless. A human operator can handle this by setting the scheduling +policy to `Active`, or we can bake a fill timeout into the storage controller. + +## Optimizations + +### Location Warmth + +When cutting over to a secondary, the storage controller will wait for it to +become "warm" (i.e. download enough of the tenant's data). This means that some +reconciliations can take significantly longer than others and hold up precious +reconciliation units. As an optimization, the drain stage can only cut over +tenants that are already "warm". Similarly, the fill stage can prioritise the +"warmest" tenants in the fill. + +Given that the number of tenants managed by the storage controller will be fairly low +for the foreseeable future, the first implementation could simply query the tenants +for secondary status. This doesn't scale well with increasing tenant counts, so +eventually we will need new pageserver API endpoints to report the sets of +"warm" and "cold" nodes. + +## Alternatives Considered + +### Draining and Filling Purely as Scheduling Constraints + +At its core, the storage controller is a big background loop that detects changes +in the environment and reacts to them. One could express draining and filling +of nodes purely in terms of constraining the scheduler (as opposed to having +such background tasks). + +While theoretically nice, I think that's harder to implement and, more importantly, to operate and reason about. +Consider cancellation of a drain/fill operation. 
We would have to update the scheduler state, create +an entirely new schedule (intent state) and start work on applying that. It gets trickier if we wish +to cancel the reconciliation tasks spawned by drain/fill nodes. How would we know which ones belong +to the conceptual drain/fill? One could add labels to reconciliations, but it gets messy in my opinion. + +It would also mean that reconciliations themselves have side effects that persist in the database +(persist something to the databse when the drain is done), which I'm not conceptually fond of. + +## Proof of Concept + +This RFC is accompanied by a POC which implements nearly everything mentioned here +apart from the optimizations and some of the failure handling: +https://github.com/neondatabase/neon/pull/7682 diff --git a/libs/metrics/src/hll.rs b/libs/metrics/src/hll.rs index f53511ab5cc3..723916a7421a 100644 --- a/libs/metrics/src/hll.rs +++ b/libs/metrics/src/hll.rs @@ -13,11 +13,7 @@ use std::{ use measured::{ label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor}, - metric::{ - group::{Encoding, MetricValue}, - name::MetricNameEncoder, - Metric, MetricType, MetricVec, - }, + metric::{counter::CounterState, name::MetricNameEncoder, Metric, MetricType, MetricVec}, text::TextEncoder, LabelGroup, }; @@ -144,6 +140,7 @@ impl HyperLogLogState { }) } } + impl measured::metric::MetricEncoding> for HyperLogLogState { @@ -182,12 +179,13 @@ impl measured::metric::MetricEncoding = Lazy::new(|| { .expect("Failed to register maxrss_kb int gauge") }); -pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[ - 0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5, -]; +/// Most common fsync latency is 50 µs - 100 µs, but it can be much higher, +/// especially during many concurrent disk operations. 
+pub const DISK_FSYNC_SECONDS_BUCKETS: &[f64] = + &[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 30.0]; pub struct BuildInfo { pub revision: &'static str, @@ -170,8 +171,11 @@ fn write_gauge( labels: impl LabelGroup, name: impl MetricNameEncoder, enc: &mut Enc, -) -> Result<(), Enc::Err> { - enc.write_metric_value(name, labels, MetricValue::Int(x)) +) -> Result<(), Enc::Err> +where + GaugeState: MetricEncoding, +{ + GaugeState::new(x).collect_into(&(), labels, name, enc) } #[derive(Default)] @@ -543,15 +547,6 @@ impl Encoding for Inc { fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { self.0.write_help(name, help) } - - fn write_metric_value( - &mut self, - name: impl MetricNameEncoder, - labels: impl LabelGroup, - value: MetricValue, - ) -> Result<(), Self::Err> { - self.0.write_metric_value(name, labels, value) - } } impl MetricEncoding> for MeasuredCounterPairState @@ -578,15 +573,6 @@ impl Encoding for Dec { fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> { self.0.write_help(name, help) } - - fn write_metric_value( - &mut self, - name: impl MetricNameEncoder, - labels: impl LabelGroup, - value: MetricValue, - ) -> Result<(), Self::Err> { - self.0.write_metric_value(name, labels, value) - } } /// Write the dec counter to the encoder diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index a0d10dc665dc..f05c1315eafa 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -11,6 +11,27 @@ use crate::{ shard::{ShardStripeSize, TenantShardId}, }; +#[derive(Serialize, Deserialize, Debug)] +#[serde(deny_unknown_fields)] +pub struct TenantCreateRequest { + pub new_tenant_id: TenantShardId, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub generation: Option, + + // If omitted, create a single shard with TenantShardId::unsharded() + #[serde(default)] + #[serde(skip_serializing_if = "ShardParameters::is_unsharded")] + pub shard_parameters: ShardParameters, + + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub placement_policy: Option, + + #[serde(flatten)] + pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it +} + #[derive(Serialize, Deserialize)] pub struct TenantCreateResponseShard { pub shard_id: TenantShardId, @@ -280,4 +301,19 @@ mod test { assert_eq!(serde_json::from_str::(&encoded)?, v); Ok(()) } + + #[test] + fn test_reject_unknown_field() { + let id = TenantId::generate(); + let create_request = serde_json::json!({ + "new_tenant_id": id.to_string(), + "unknown_field": "unknown_value".to_string(), + }); + let err = serde_json::from_value::(create_request).unwrap_err(); + assert!( + err.to_string().contains("unknown field `unknown_field`"), + "expect unknown field `unknown_field` error, got: {}", + err + ); + } } diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index cd430bfab7d4..0acd83753eff 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -29,7 +29,7 @@ pub const KEY_SIZE: usize = 18; /// See [`Key::to_i128`] for more information on the encoding. pub const METADATA_KEY_SIZE: usize = 16; -/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x40 is a metadata key. +/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x60 is a metadata key. 
pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60; pub const METADATA_KEY_END_PREFIX: u8 = 0x7F; diff --git a/libs/pageserver_api/src/keyspace.rs b/libs/pageserver_api/src/keyspace.rs index 9a61f2ad81ae..401887d3629c 100644 --- a/libs/pageserver_api/src/keyspace.rs +++ b/libs/pageserver_api/src/keyspace.rs @@ -17,6 +17,16 @@ pub struct KeySpace { pub ranges: Vec>, } +impl std::fmt::Display for KeySpace { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "[")?; + for range in &self.ranges { + write!(f, "{}..{},", range.start, range.end)?; + } + write!(f, "]") + } +} + /// A wrapper type for sparse keyspaces. #[derive(Clone, Debug, Default, PartialEq, Eq)] pub struct SparseKeySpace(pub KeySpace); diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 4875f4949522..d360cc6e870f 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -9,6 +9,7 @@ use std::{ collections::HashMap, io::{BufRead, Read}, num::{NonZeroU64, NonZeroUsize}, + str::FromStr, sync::atomic::AtomicUsize, time::{Duration, SystemTime}, }; @@ -25,7 +26,6 @@ use utils::{ serde_system_time, }; -use crate::controller_api::PlacementPolicy; use crate::{ reltag::RelTag, shard::{ShardCount, ShardStripeSize, TenantShardId}, @@ -229,6 +229,11 @@ pub struct TimelineCreateRequest { pub pg_version: Option, } +#[derive(Serialize, Deserialize, Clone)] +pub struct LsnLeaseRequest { + pub lsn: Lsn, +} + #[derive(Serialize, Deserialize)] pub struct TenantShardSplitRequest { pub new_shard_count: u8, @@ -271,28 +276,6 @@ impl Default for ShardParameters { } } -#[derive(Serialize, Deserialize, Debug)] -#[serde(deny_unknown_fields)] -pub struct TenantCreateRequest { - pub new_tenant_id: TenantShardId, - #[serde(default)] - #[serde(skip_serializing_if = "Option::is_none")] - pub generation: Option, - - // If omitted, create a single shard with TenantShardId::unsharded() - #[serde(default)] - #[serde(skip_serializing_if = "ShardParameters::is_unsharded")] - pub shard_parameters: ShardParameters, - - // This parameter is only meaningful in requests sent to the storage controller - #[serde(default)] - #[serde(skip_serializing_if = "Option::is_none")] - pub placement_policy: Option, - - #[serde(flatten)] - pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it -} - /// An alternative representation of `pageserver::tenant::TenantConf` with /// simpler types. #[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)] @@ -455,6 +438,41 @@ pub enum CompactionAlgorithm { Tiered, } +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +pub enum ImageCompressionAlgorithm { + // Disabled for writes, support decompressing during read path + Disabled, + /// Zstandard compression. Level 0 means and None mean the same (default level). Levels can be negative as well. + /// For details, see the [manual](http://facebook.github.io/zstd/zstd_manual.html). 
+ Zstd { + level: Option, + }, +} + +impl FromStr for ImageCompressionAlgorithm { + type Err = anyhow::Error; + fn from_str(s: &str) -> Result { + let mut components = s.split(['(', ')']); + let first = components + .next() + .ok_or_else(|| anyhow::anyhow!("empty string"))?; + match first { + "disabled" => Ok(ImageCompressionAlgorithm::Disabled), + "zstd" => { + let level = if let Some(v) = components.next() { + let v: i8 = v.parse()?; + Some(v) + } else { + None + }; + + Ok(ImageCompressionAlgorithm::Zstd { level }) + } + _ => anyhow::bail!("invalid specifier '{first}'"), + } + } +} + #[derive(Eq, PartialEq, Debug, Clone, Serialize, Deserialize)] pub struct CompactionAlgorithmSettings { pub kind: CompactionAlgorithm, @@ -547,10 +565,6 @@ pub struct LocationConfigListResponse { pub tenant_shards: Vec<(TenantShardId, Option)>, } -#[derive(Serialize, Deserialize)] -#[serde(transparent)] -pub struct TenantCreateResponse(pub TenantId); - #[derive(Serialize)] pub struct StatusResponse { pub id: NodeId, @@ -670,6 +684,16 @@ pub struct TimelineInfo { pub current_physical_size: Option, // is None when timeline is Unloaded pub current_logical_size_non_incremental: Option, + /// How many bytes of WAL are within this branch's pitr_interval. If the pitr_interval goes + /// beyond the branch's branch point, we only count up to the branch point. + pub pitr_history_size: u64, + + /// Whether this branch's branch point is within its ancestor's PITR interval (i.e. any + /// ancestor data used by this branch would have been retained anyway). If this is false, then + /// this branch may be imposing a cost on the ancestor by causing it to retain layers that it would + /// otherwise be able to GC. + pub within_ancestor_pitr: bool, + pub timeline_dir_layer_file_size_sum: Option, pub wal_source_connstr: Option, @@ -1507,18 +1531,6 @@ mod tests { #[test] fn test_reject_unknown_field() { - let id = TenantId::generate(); - let create_request = json!({ - "new_tenant_id": id.to_string(), - "unknown_field": "unknown_value".to_string(), - }); - let err = serde_json::from_value::(create_request).unwrap_err(); - assert!( - err.to_string().contains("unknown field `unknown_field`"), - "expect unknown field `unknown_field` error, got: {}", - err - ); - let id = TenantId::generate(); let config_request = json!({ "tenant_id": id.to_string(), @@ -1653,4 +1665,25 @@ mod tests { AuxFilePolicy::CrossValidation ); } + + #[test] + fn test_image_compression_algorithm_parsing() { + use ImageCompressionAlgorithm::*; + assert_eq!( + ImageCompressionAlgorithm::from_str("disabled").unwrap(), + Disabled + ); + assert_eq!( + ImageCompressionAlgorithm::from_str("zstd").unwrap(), + Zstd { level: None } + ); + assert_eq!( + ImageCompressionAlgorithm::from_str("zstd(18)").unwrap(), + Zstd { level: Some(18) } + ); + assert_eq!( + ImageCompressionAlgorithm::from_str("zstd(-3)").unwrap(), + Zstd { level: Some(-3) } + ); + } } diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 8c5a4e616869..e83cf4c855a1 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -1,59 +1,42 @@ -use std::{ops::RangeInclusive, str::FromStr}; +//! See docs/rfcs/031-sharding-static.md for an overview of sharding. +//! +//! This module contains a variety of types used to represent the concept of sharding +//! a Neon tenant across multiple physical shards. Since there are quite a few of these, +//! we provide an summary here. +//! +//! Types used to describe shards: +//! 
- [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value +//! which identifies a tenant which is not shard-aware. This means its storage paths do not include +//! a shard suffix. +//! - [`ShardNumber`] is simply the zero-based index of a shard within a tenant. +//! - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId` +//! without the tenant ID. This is useful for things that are implicitly scoped to a particular +//! tenant, such as layer files. +//! - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient +//! detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read. +//! - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as +//! four hex digits. An unsharded tenant is `0000`. +//! - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant +//! +//! Types used to describe the parameters for data distribution in a sharded tenant: +//! - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across +//! multiple shards. Its value is given in 8kiB pages. +//! - [`ShardLayout`] describes the data distribution scheme, and at time of writing is +//! always zero: this is provided for future upgrades that might introduce different +//! data distribution schemes. +//! +//! Examples: +//! - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000 +//! - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001 +//! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive), +//! and their slugs are 0004, 0104, 0204, and 0304. use crate::{key::Key, models::ShardParameters}; -use hex::FromHex; use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; -use utils::id::TenantId; -/// See docs/rfcs/031-sharding-static.md for an overview of sharding. -/// -/// This module contains a variety of types used to represent the concept of sharding -/// a Neon tenant across multiple physical shards. Since there are quite a few of these, -/// we provide an summary here. -/// -/// Types used to describe shards: -/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value -/// which identifies a tenant which is not shard-aware. This means its storage paths do not include -/// a shard suffix. -/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant. -/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId` -/// without the tenant ID. This is useful for things that are implicitly scoped to a particular -/// tenant, such as layer files. -/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient -/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read. -/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as -/// four hex digits. An unsharded tenant is `0000`. -/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant -/// -/// Types used to describe the parameters for data distribution in a sharded tenant: -/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across -/// multiple shards. Its value is given in 8kiB pages. 
-/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is -/// always zero: this is provided for future upgrades that might introduce different -/// data distribution schemes. -/// -/// Examples: -/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000 -/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001 -/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive), -/// and their slugs are 0004, 0104, 0204, and 0304. - -#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] -pub struct ShardNumber(pub u8); - -#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] -pub struct ShardCount(u8); - -/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant, -/// when we need to know which shard we're dealing with, but do not need to know the full -/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know -/// the fully qualified TenantShardId. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct ShardIndex { - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} +#[doc(inline)] +pub use ::utils::shard::*; /// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`], /// and to check whether that [`ShardNumber`] is the same as the current shard. @@ -65,362 +48,6 @@ pub struct ShardIdentity { layout: ShardLayout, } -/// Formatting helper, for generating the `shard_id` label in traces. -struct ShardSlug<'a>(&'a TenantShardId); - -/// TenantShardId globally identifies a particular shard in a particular tenant. -/// -/// These are written as `-`, for example: -/// # The second shard in a two-shard tenant -/// 072f1291a5310026820b2fe4b2968934-0102 -/// -/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without -/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables -/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`]. -/// -/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs, -/// is both forward and backward compatible with TenantId: a legacy TenantId can be -/// decoded as a TenantShardId, and when re-encoded it will be parseable -/// as a TenantId. -#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] -pub struct TenantShardId { - pub tenant_id: TenantId, - pub shard_number: ShardNumber, - pub shard_count: ShardCount, -} - -impl ShardCount { - pub const MAX: Self = Self(u8::MAX); - - /// The internal value of a ShardCount may be zero, which means "1 shard, but use - /// legacy format for TenantShardId that excludes the shard suffix", also known - /// as [`TenantShardId::unsharded`]. - /// - /// This method returns the actual number of shards, i.e. if our internal value is - /// zero, we return 1 (unsharded tenants have 1 shard). - pub fn count(&self) -> u8 { - if self.0 > 0 { - self.0 - } else { - 1 - } - } - - /// The literal internal value: this is **not** the number of shards in the - /// tenant, as we have a special zero value for legacy unsharded tenants. Use - /// [`Self::count`] if you want to know the cardinality of shards. 
- pub fn literal(&self) -> u8 { - self.0 - } - - /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but - /// uses the legacy format for `TenantShardId`. See also the documentation for - /// [`Self::count`]. - pub fn is_unsharded(&self) -> bool { - self.0 == 0 - } - - /// `v` may be zero, or the number of shards in the tenant. `v` is what - /// [`Self::literal`] would return. - pub const fn new(val: u8) -> Self { - Self(val) - } -} - -impl ShardNumber { - pub const MAX: Self = Self(u8::MAX); -} - -impl TenantShardId { - pub fn unsharded(tenant_id: TenantId) -> Self { - Self { - tenant_id, - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - } - } - - /// The range of all TenantShardId that belong to a particular TenantId. This is useful when - /// you have a BTreeMap of TenantShardId, and are querying by TenantId. - pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive { - RangeInclusive::new( - Self { - tenant_id, - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - }, - Self { - tenant_id, - shard_number: ShardNumber::MAX, - shard_count: ShardCount::MAX, - }, - ) - } - - pub fn shard_slug(&self) -> impl std::fmt::Display + '_ { - ShardSlug(self) - } - - /// Convenience for code that has special behavior on the 0th shard. - pub fn is_shard_zero(&self) -> bool { - self.shard_number == ShardNumber(0) - } - - /// The "unsharded" value is distinct from simply having a single shard: it represents - /// a tenant which is not shard-aware at all, and whose storage paths will not include - /// a shard suffix. - pub fn is_unsharded(&self) -> bool { - self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() - } - - /// Convenience for dropping the tenant_id and just getting the ShardIndex: this - /// is useful when logging from code that is already in a span that includes tenant ID, to - /// keep messages reasonably terse. - pub fn to_index(&self) -> ShardIndex { - ShardIndex { - shard_number: self.shard_number, - shard_count: self.shard_count, - } - } - - /// Calculate the children of this TenantShardId when splitting the overall tenant into - /// the given number of shards. - pub fn split(&self, new_shard_count: ShardCount) -> Vec { - let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1); - let mut child_shards = Vec::new(); - for shard_number in 0..ShardNumber(new_shard_count.0).0 { - // Key mapping is based on a round robin mapping of key hash modulo shard count, - // so our child shards are the ones which the same keys would map to. - if shard_number % effective_old_shard_count == self.shard_number.0 { - child_shards.push(TenantShardId { - tenant_id: self.tenant_id, - shard_number: ShardNumber(shard_number), - shard_count: new_shard_count, - }) - } - } - - child_shards - } -} - -impl<'a> std::fmt::Display for ShardSlug<'a> { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "{:02x}{:02x}", - self.0.shard_number.0, self.0.shard_count.0 - ) - } -} - -impl std::fmt::Display for TenantShardId { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - if self.shard_count != ShardCount(0) { - write!(f, "{}-{}", self.tenant_id, self.shard_slug()) - } else { - // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this - // is distinct from the normal single shard case (shard count == 1). 
- self.tenant_id.fmt(f) - } - } -} - -impl std::fmt::Debug for TenantShardId { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // Debug is the same as Display: the compact hex representation - write!(f, "{}", self) - } -} - -impl std::str::FromStr for TenantShardId { - type Err = hex::FromHexError; - - fn from_str(s: &str) -> Result { - // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count - if s.len() == 32 { - // Legacy case: no shard specified - Ok(Self { - tenant_id: TenantId::from_str(s)?, - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - }) - } else if s.len() == 37 { - let bytes = s.as_bytes(); - let tenant_id = TenantId::from_hex(&bytes[0..32])?; - let mut shard_parts: [u8; 2] = [0u8; 2]; - hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?; - Ok(Self { - tenant_id, - shard_number: ShardNumber(shard_parts[0]), - shard_count: ShardCount(shard_parts[1]), - }) - } else { - Err(hex::FromHexError::InvalidStringLength) - } - } -} - -impl From<[u8; 18]> for TenantShardId { - fn from(b: [u8; 18]) -> Self { - let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap(); - - Self { - tenant_id: TenantId::from(tenant_id_bytes), - shard_number: ShardNumber(b[16]), - shard_count: ShardCount(b[17]), - } - } -} - -impl ShardIndex { - pub fn new(number: ShardNumber, count: ShardCount) -> Self { - Self { - shard_number: number, - shard_count: count, - } - } - pub fn unsharded() -> Self { - Self { - shard_number: ShardNumber(0), - shard_count: ShardCount(0), - } - } - - /// The "unsharded" value is distinct from simply having a single shard: it represents - /// a tenant which is not shard-aware at all, and whose storage paths will not include - /// a shard suffix. - pub fn is_unsharded(&self) -> bool { - self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) - } - - /// For use in constructing remote storage paths: concatenate this with a TenantId - /// to get a fully qualified TenantShardId. - /// - /// Backward compat: this function returns an empty string if Self::is_unsharded, such - /// that the legacy pre-sharding remote key format is preserved. 
- pub fn get_suffix(&self) -> String { - if self.is_unsharded() { - "".to_string() - } else { - format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0) - } - } -} - -impl std::fmt::Display for ShardIndex { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0) - } -} - -impl std::fmt::Debug for ShardIndex { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - // Debug is the same as Display: the compact hex representation - write!(f, "{}", self) - } -} - -impl std::str::FromStr for ShardIndex { - type Err = hex::FromHexError; - - fn from_str(s: &str) -> Result { - // Expect format: 1 byte shard number, 1 byte shard count - if s.len() == 4 { - let bytes = s.as_bytes(); - let mut shard_parts: [u8; 2] = [0u8; 2]; - hex::decode_to_slice(bytes, &mut shard_parts)?; - Ok(Self { - shard_number: ShardNumber(shard_parts[0]), - shard_count: ShardCount(shard_parts[1]), - }) - } else { - Err(hex::FromHexError::InvalidStringLength) - } - } -} - -impl From<[u8; 2]> for ShardIndex { - fn from(b: [u8; 2]) -> Self { - Self { - shard_number: ShardNumber(b[0]), - shard_count: ShardCount(b[1]), - } - } -} - -impl Serialize for TenantShardId { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - if serializer.is_human_readable() { - serializer.collect_str(self) - } else { - // Note: while human encoding of [`TenantShardId`] is backward and forward - // compatible, this binary encoding is not. - let mut packed: [u8; 18] = [0; 18]; - packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); - packed[16] = self.shard_number.0; - packed[17] = self.shard_count.0; - - packed.serialize(serializer) - } - } -} - -impl<'de> Deserialize<'de> for TenantShardId { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - struct IdVisitor { - is_human_readable_deserializer: bool, - } - - impl<'de> serde::de::Visitor<'de> for IdVisitor { - type Value = TenantShardId; - - fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { - if self.is_human_readable_deserializer { - formatter.write_str("value in form of hex string") - } else { - formatter.write_str("value in form of integer array([u8; 18])") - } - } - - fn visit_seq(self, seq: A) -> Result - where - A: serde::de::SeqAccess<'de>, - { - let s = serde::de::value::SeqAccessDeserializer::new(seq); - let id: [u8; 18] = Deserialize::deserialize(s)?; - Ok(TenantShardId::from(id)) - } - - fn visit_str(self, v: &str) -> Result - where - E: serde::de::Error, - { - TenantShardId::from_str(v).map_err(E::custom) - } - } - - if deserializer.is_human_readable() { - deserializer.deserialize_str(IdVisitor { - is_human_readable_deserializer: true, - }) - } else { - deserializer.deserialize_tuple( - 18, - IdVisitor { - is_human_readable_deserializer: false, - }, - ) - } - } -} - /// Stripe size in number of pages #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)] pub struct ShardStripeSize(pub u32); @@ -585,77 +212,6 @@ impl ShardIdentity { } } -impl Serialize for ShardIndex { - fn serialize(&self, serializer: S) -> Result - where - S: serde::Serializer, - { - if serializer.is_human_readable() { - serializer.collect_str(self) - } else { - // Binary encoding is not used in index_part.json, but is included in anticipation of - // switching various structures (e.g. inter-process communication, remote metadata) to more - // compact binary encodings in future. 
- let mut packed: [u8; 2] = [0; 2]; - packed[0] = self.shard_number.0; - packed[1] = self.shard_count.0; - packed.serialize(serializer) - } - } -} - -impl<'de> Deserialize<'de> for ShardIndex { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - struct IdVisitor { - is_human_readable_deserializer: bool, - } - - impl<'de> serde::de::Visitor<'de> for IdVisitor { - type Value = ShardIndex; - - fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { - if self.is_human_readable_deserializer { - formatter.write_str("value in form of hex string") - } else { - formatter.write_str("value in form of integer array([u8; 2])") - } - } - - fn visit_seq(self, seq: A) -> Result - where - A: serde::de::SeqAccess<'de>, - { - let s = serde::de::value::SeqAccessDeserializer::new(seq); - let id: [u8; 2] = Deserialize::deserialize(s)?; - Ok(ShardIndex::from(id)) - } - - fn visit_str(self, v: &str) -> Result - where - E: serde::de::Error, - { - ShardIndex::from_str(v).map_err(E::custom) - } - } - - if deserializer.is_human_readable() { - deserializer.deserialize_str(IdVisitor { - is_human_readable_deserializer: true, - }) - } else { - deserializer.deserialize_tuple( - 2, - IdVisitor { - is_human_readable_deserializer: false, - }, - ) - } - } -} - /// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys /// in order to be able to serve basebackup requests without peer communication). fn key_is_shard0(key: &Key) -> bool { @@ -737,7 +293,9 @@ pub fn describe( #[cfg(test)] mod tests { - use utils::Hex; + use std::str::FromStr; + + use utils::{id::TenantId, Hex}; use super::*; diff --git a/libs/postgres_backend/Cargo.toml b/libs/postgres_backend/Cargo.toml index 8e249c09f7e0..c7611b9f213d 100644 --- a/libs/postgres_backend/Cargo.toml +++ b/libs/postgres_backend/Cargo.toml @@ -13,6 +13,7 @@ rustls.workspace = true serde.workspace = true thiserror.workspace = true tokio.workspace = true +tokio-util.workspace = true tokio-rustls.workspace = true tracing.workspace = true @@ -23,4 +24,4 @@ workspace_hack.workspace = true once_cell.workspace = true rustls-pemfile.workspace = true tokio-postgres.workspace = true -tokio-postgres-rustls.workspace = true \ No newline at end of file +tokio-postgres-rustls.workspace = true diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 6c41b7f347a9..7c7c6535b338 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -16,6 +16,7 @@ use std::{fmt, io}; use std::{future::Future, str::FromStr}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio_rustls::TlsAcceptor; +use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, trace, warn}; use pq_proto::framed::{ConnectionError, Framed, FramedReader, FramedWriter}; @@ -400,21 +401,15 @@ impl PostgresBackend { } /// Wrapper for run_message_loop() that shuts down socket when we are done - pub async fn run( + pub async fn run( mut self, handler: &mut impl Handler, - shutdown_watcher: F, - ) -> Result<(), QueryError> - where - F: Fn() -> S + Clone, - S: Future, - { - let ret = self - .run_message_loop(handler, shutdown_watcher.clone()) - .await; + cancel: &CancellationToken, + ) -> Result<(), QueryError> { + let ret = self.run_message_loop(handler, cancel).await; tokio::select! { - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // do nothing; we most likely got already stopped by shutdown and will log it next. 
} _ = self.framed.shutdown() => { @@ -444,21 +439,17 @@ impl PostgresBackend { } } - async fn run_message_loop( + async fn run_message_loop( &mut self, handler: &mut impl Handler, - shutdown_watcher: F, - ) -> Result<(), QueryError> - where - F: Fn() -> S, - S: Future, - { + cancel: &CancellationToken, + ) -> Result<(), QueryError> { trace!("postgres backend to {:?} started", self.peer_addr); tokio::select!( biased; - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // We were requested to shut down. tracing::info!("shutdown request received during handshake"); return Err(QueryError::Shutdown) @@ -473,7 +464,7 @@ impl PostgresBackend { let mut query_string = Bytes::new(); while let Some(msg) = tokio::select!( biased; - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // We were requested to shut down. tracing::info!("shutdown request received in run_message_loop"); return Err(QueryError::Shutdown) @@ -485,7 +476,7 @@ impl PostgresBackend { let result = self.process_message(handler, msg, &mut query_string).await; tokio::select!( biased; - _ = shutdown_watcher() => { + _ = cancel.cancelled() => { // We were requested to shut down. tracing::info!("shutdown request received during response flush"); @@ -672,11 +663,17 @@ impl PostgresBackend { assert!(self.state < ProtoState::Authentication); let have_tls = self.tls_config.is_some(); match msg { - FeStartupPacket::SslRequest => { + FeStartupPacket::SslRequest { direct } => { debug!("SSL requested"); - self.write_message(&BeMessage::EncryptionResponse(have_tls)) - .await?; + if !direct { + self.write_message(&BeMessage::EncryptionResponse(have_tls)) + .await?; + } else if !have_tls { + return Err(QueryError::Other(anyhow::anyhow!( + "direct SSL negotiation but no TLS support" + ))); + } if have_tls { self.start_tls().await?; diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index 80df9db858a9..7ec85f0dbe90 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -3,13 +3,14 @@ use once_cell::sync::Lazy; use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError}; use pq_proto::{BeMessage, RowDescriptor}; use std::io::Cursor; -use std::{future, sync::Arc}; +use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::{TcpListener, TcpStream}; use tokio_postgres::config::SslMode; use tokio_postgres::tls::MakeTlsConnect; use tokio_postgres::{Config, NoTls, SimpleQueryMessage}; use tokio_postgres_rustls::MakeRustlsConnect; +use tokio_util::sync::CancellationToken; // generate client, server test streams async fn make_tcp_pair() -> (TcpStream, TcpStream) { @@ -50,7 +51,7 @@ async fn simple_select() { tokio::spawn(async move { let mut handler = TestHandler {}; - pgbackend.run(&mut handler, future::pending::<()>).await + pgbackend.run(&mut handler, &CancellationToken::new()).await }); let conf = Config::new(); @@ -102,7 +103,7 @@ async fn simple_select_ssl() { tokio::spawn(async move { let mut handler = TestHandler {}; - pgbackend.run(&mut handler, future::pending::<()>).await + pgbackend.run(&mut handler, &CancellationToken::new()).await }); let client_cfg = rustls::ClientConfig::builder() diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 0bbb91afc282..d25b23663bf6 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -356,6 +356,28 @@ impl CheckPoint { } false } + + /// Advance next multi-XID/offset to those given in 
arguments. + /// + /// It's important that this handles wraparound correctly. This should match the + /// MultiXactAdvanceNextMXact() logic in PostgreSQL's xlog_redo() function. + /// + /// Returns 'true' if the Checkpoint was updated. + pub fn update_next_multixid(&mut self, multi_xid: u32, multi_offset: u32) -> bool { + let mut modified = false; + + if multi_xid.wrapping_sub(self.nextMulti) as i32 > 0 { + self.nextMulti = multi_xid; + modified = true; + } + + if multi_offset.wrapping_sub(self.nextMultiOffset) as i32 > 0 { + self.nextMultiOffset = multi_offset; + modified = true; + } + + modified + } } /// Generate new, empty WAL segment, with correct block headers at the first diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs index 496458b2e42d..750affc94eed 100644 --- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs +++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs @@ -202,6 +202,53 @@ pub fn test_update_next_xid() { assert_eq!(checkpoint.nextXid.value, 2048); } +#[test] +pub fn test_update_next_multixid() { + let checkpoint_buf = [0u8; std::mem::size_of::()]; + let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap(); + + // simple case + checkpoint.nextMulti = 20; + checkpoint.nextMultiOffset = 20; + checkpoint.update_next_multixid(1000, 2000); + assert_eq!(checkpoint.nextMulti, 1000); + assert_eq!(checkpoint.nextMultiOffset, 2000); + + // No change + checkpoint.update_next_multixid(500, 900); + assert_eq!(checkpoint.nextMulti, 1000); + assert_eq!(checkpoint.nextMultiOffset, 2000); + + // Close to wraparound, but not wrapped around yet + checkpoint.nextMulti = 0xffff0000; + checkpoint.nextMultiOffset = 0xfffe0000; + checkpoint.update_next_multixid(0xffff00ff, 0xfffe00ff); + assert_eq!(checkpoint.nextMulti, 0xffff00ff); + assert_eq!(checkpoint.nextMultiOffset, 0xfffe00ff); + + // Wraparound + checkpoint.update_next_multixid(1, 900); + assert_eq!(checkpoint.nextMulti, 1); + assert_eq!(checkpoint.nextMultiOffset, 900); + + // Wraparound nextMulti to 0. + // + // It's a bit surprising that nextMulti can be 0, because that's a special value + // (InvalidMultiXactId). However, that's how Postgres does it at multi-xid wraparound: + // nextMulti wraps around to 0, but then when the next multi-xid is assigned, it skips + // the 0 and the next multi-xid actually assigned is 1. + checkpoint.nextMulti = 0xffff0000; + checkpoint.nextMultiOffset = 0xfffe0000; + checkpoint.update_next_multixid(0, 0xfffe00ff); + assert_eq!(checkpoint.nextMulti, 0); + assert_eq!(checkpoint.nextMultiOffset, 0xfffe00ff); + + // Wraparound nextMultiOffset to 0 + checkpoint.update_next_multixid(0, 0); + assert_eq!(checkpoint.nextMulti, 0); + assert_eq!(checkpoint.nextMultiOffset, 0); +} + #[test] pub fn test_encode_logical_message() { let expected = [ diff --git a/libs/pq_proto/src/framed.rs b/libs/pq_proto/src/framed.rs index 6e97b8c2a02c..ccbb90e3842e 100644 --- a/libs/pq_proto/src/framed.rs +++ b/libs/pq_proto/src/framed.rs @@ -44,9 +44,9 @@ impl ConnectionError { /// Wraps async io `stream`, providing messages to write/flush + read Postgres /// messages. 
pub struct Framed { - stream: S, - read_buf: BytesMut, - write_buf: BytesMut, + pub stream: S, + pub read_buf: BytesMut, + pub write_buf: BytesMut, } impl Framed { diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index cee374201763..a01191bd5de3 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -39,14 +39,39 @@ pub enum FeMessage { PasswordMessage(Bytes), } +#[derive(Clone, Copy, PartialEq, PartialOrd)] +pub struct ProtocolVersion(u32); + +impl ProtocolVersion { + pub const fn new(major: u16, minor: u16) -> Self { + Self((major as u32) << 16 | minor as u32) + } + pub const fn minor(self) -> u16 { + self.0 as u16 + } + pub const fn major(self) -> u16 { + (self.0 >> 16) as u16 + } +} + +impl fmt::Debug for ProtocolVersion { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_list() + .entry(&self.major()) + .entry(&self.minor()) + .finish() + } +} + #[derive(Debug)] pub enum FeStartupPacket { CancelRequest(CancelKeyData), - SslRequest, + SslRequest { + direct: bool, + }, GssEncRequest, StartupMessage { - major_version: u32, - minor_version: u32, + version: ProtocolVersion, params: StartupMessageParams, }, } @@ -301,11 +326,23 @@ impl FeStartupPacket { /// different from [`FeMessage::parse`] because startup messages don't have /// message type byte; otherwise, its comments apply. pub fn parse(buf: &mut BytesMut) -> Result, ProtocolError> { + /// const MAX_STARTUP_PACKET_LENGTH: usize = 10000; - const RESERVED_INVALID_MAJOR_VERSION: u32 = 1234; - const CANCEL_REQUEST_CODE: u32 = 5678; - const NEGOTIATE_SSL_CODE: u32 = 5679; - const NEGOTIATE_GSS_CODE: u32 = 5680; + const RESERVED_INVALID_MAJOR_VERSION: u16 = 1234; + /// + const CANCEL_REQUEST_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5678); + /// + const NEGOTIATE_SSL_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5679); + /// + const NEGOTIATE_GSS_CODE: ProtocolVersion = ProtocolVersion::new(1234, 5680); + + // + // First byte indicates standard SSL handshake message + // (It can't be a Postgres startup length because in network byte order + // that would be a startup packet hundreds of megabytes long) + if buf.first() == Some(&0x16) { + return Ok(Some(FeStartupPacket::SslRequest { direct: true })); + } // need at least 4 bytes with packet len if buf.len() < 4 { @@ -338,12 +375,10 @@ impl FeStartupPacket { let mut msg = buf.split_to(len).freeze(); msg.advance(4); // consume len - let request_code = msg.get_u32(); - let req_hi = request_code >> 16; - let req_lo = request_code & ((1 << 16) - 1); + let request_code = ProtocolVersion(msg.get_u32()); // StartupMessage, CancelRequest, SSLRequest etc are differentiated by request code. 
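// Illustrative sketch (the arithmetic follows from ProtocolVersion::new above;
// the named constants are defined a few lines up): packing major/minor into one
// u32 reproduces the classic PostgreSQL wire codes, which is what lets the match
// below compare whole ProtocolVersion values. The 0x16 peek earlier in this
// function works because 0x16 (22) is the TLS "handshake" record type.
//
//     (3u32    << 16) | 0    == 196_608     // ordinary protocol 3.0 startup
//     (1234u32 << 16) | 5678 == 80_877_102  // CANCEL_REQUEST_CODE
//     (1234u32 << 16) | 5679 == 80_877_103  // NEGOTIATE_SSL_CODE
//     (1234u32 << 16) | 5680 == 80_877_104  // NEGOTIATE_GSS_CODE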
- let message = match (req_hi, req_lo) { - (RESERVED_INVALID_MAJOR_VERSION, CANCEL_REQUEST_CODE) => { + let message = match request_code { + CANCEL_REQUEST_CODE => { if msg.remaining() != 8 { return Err(ProtocolError::BadMessage( "CancelRequest message is malformed, backend PID / secret key missing" @@ -355,21 +390,22 @@ impl FeStartupPacket { cancel_key: msg.get_i32(), }) } - (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_SSL_CODE) => { + NEGOTIATE_SSL_CODE => { // Requested upgrade to SSL (aka TLS) - FeStartupPacket::SslRequest + FeStartupPacket::SslRequest { direct: false } } - (RESERVED_INVALID_MAJOR_VERSION, NEGOTIATE_GSS_CODE) => { + NEGOTIATE_GSS_CODE => { // Requested upgrade to GSSAPI FeStartupPacket::GssEncRequest } - (RESERVED_INVALID_MAJOR_VERSION, unrecognized_code) => { + version if version.major() == RESERVED_INVALID_MAJOR_VERSION => { return Err(ProtocolError::Protocol(format!( - "Unrecognized request code {unrecognized_code}" + "Unrecognized request code {}", + version.minor() ))); } // TODO bail if protocol major_version is not 3? - (major_version, minor_version) => { + version => { // StartupMessage let s = str::from_utf8(&msg).map_err(|_e| { @@ -382,8 +418,7 @@ impl FeStartupPacket { })?; FeStartupPacket::StartupMessage { - major_version, - minor_version, + version, params: StartupMessageParams { params: msg.slice_ref(s.as_bytes()), }, @@ -522,6 +557,10 @@ pub enum BeMessage<'a> { RowDescription(&'a [RowDescriptor<'a>]), XLogData(XLogDataBody<'a>), NoticeResponse(&'a str), + NegotiateProtocolVersion { + version: ProtocolVersion, + options: &'a [&'a str], + }, KeepAlive(WalSndKeepAlive), } @@ -945,6 +984,18 @@ impl<'a> BeMessage<'a> { buf.put_u8(u8::from(req.request_reply)); }); } + + BeMessage::NegotiateProtocolVersion { version, options } => { + buf.put_u8(b'v'); + write_body(buf, |buf| { + buf.put_u32(version.0); + buf.put_u32(options.len() as u32); + for option in options.iter() { + write_cstr(option, buf)?; + } + Ok(()) + })? + } } Ok(()) } diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs index 8a8f6212e99b..fa3f2cba58d7 100644 --- a/libs/remote_storage/src/config.rs +++ b/libs/remote_storage/src/config.rs @@ -1,6 +1,5 @@ use std::{fmt::Debug, num::NonZeroUsize, str::FromStr, time::Duration}; -use anyhow::bail; use aws_sdk_s3::types::StorageClass; use camino::Utf8PathBuf; @@ -176,20 +175,8 @@ fn serialize_storage_class( impl RemoteStorageConfig { pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120); - pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result> { - let document: toml_edit::Document = match toml { - toml_edit::Item::Table(toml) => toml.clone().into(), - toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => { - toml.clone().into_table().into() - } - _ => bail!("toml not a table or inline table"), - }; - - if document.is_empty() { - return Ok(None); - } - - Ok(Some(toml_edit::de::from_document(document)?)) + pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result { + Ok(utils::toml_edit_ext::deserialize_item(toml)?) } } @@ -197,7 +184,7 @@ impl RemoteStorageConfig { mod tests { use super::*; - fn parse(input: &str) -> anyhow::Result> { + fn parse(input: &str) -> anyhow::Result { let toml = input.parse::().unwrap(); RemoteStorageConfig::from_toml(toml.as_item()) } @@ -207,7 +194,7 @@ mod tests { let input = "local_path = '.' 
timeout = '5s'"; - let config = parse(input).unwrap().expect("it exists"); + let config = parse(input).unwrap(); assert_eq!( config, @@ -229,7 +216,7 @@ timeout = '5s'"; timeout = '7s' "; - let config = parse(toml).unwrap().expect("it exists"); + let config = parse(toml).unwrap(); assert_eq!( config, @@ -257,7 +244,7 @@ timeout = '5s'"; timeout = '7s' "; - let config = parse(toml).unwrap().expect("it exists"); + let config = parse(toml).unwrap(); assert_eq!( config, diff --git a/libs/tenant_size_model/src/calculation.rs b/libs/tenant_size_model/src/calculation.rs index f05997ee6547..be005622199d 100644 --- a/libs/tenant_size_model/src/calculation.rs +++ b/libs/tenant_size_model/src/calculation.rs @@ -34,10 +34,10 @@ struct SegmentSize { } struct SizeAlternatives { - // cheapest alternative if parent is available. + /// cheapest alternative if parent is available. incremental: SegmentSize, - // cheapest alternative if parent node is not available + /// cheapest alternative if parent node is not available non_incremental: Option, } diff --git a/libs/tenant_size_model/src/svg.rs b/libs/tenant_size_model/src/svg.rs index f26d3aa79d1a..0de2890bb414 100644 --- a/libs/tenant_size_model/src/svg.rs +++ b/libs/tenant_size_model/src/svg.rs @@ -3,10 +3,17 @@ use std::fmt::Write; const SVG_WIDTH: f32 = 500.0; +/// Different branch kind for SVG drawing. +#[derive(PartialEq)] +pub enum SvgBranchKind { + Timeline, + Lease, +} + struct SvgDraw<'a> { storage: &'a StorageModel, branches: &'a [String], - seg_to_branch: &'a [usize], + seg_to_branch: &'a [(usize, SvgBranchKind)], sizes: &'a [SegmentSizeResult], // layout @@ -42,13 +49,18 @@ fn draw_legend(result: &mut String) -> anyhow::Result<()> { "" )?; writeln!(result, "WAL not retained")?; + writeln!( + result, + "" + )?; + writeln!(result, "LSN lease")?; Ok(()) } pub fn draw_svg( storage: &StorageModel, branches: &[String], - seg_to_branch: &[usize], + seg_to_branch: &[(usize, SvgBranchKind)], sizes: &SizeResult, ) -> anyhow::Result { let mut draw = SvgDraw { @@ -100,7 +112,7 @@ impl<'a> SvgDraw<'a> { // Layout the timelines on Y dimension. 
// TODO - let mut y = 100.0; + let mut y = 120.0; let mut branch_y_coordinates = Vec::new(); for _branch in self.branches { branch_y_coordinates.push(y); @@ -109,7 +121,7 @@ impl<'a> SvgDraw<'a> { // Calculate coordinates for each point let seg_coordinates = std::iter::zip(segments, self.seg_to_branch) - .map(|(seg, branch_id)| { + .map(|(seg, (branch_id, _))| { let x = (seg.lsn - min_lsn) as f32 / xscale; let y = branch_y_coordinates[*branch_id]; (x, y) @@ -175,6 +187,22 @@ impl<'a> SvgDraw<'a> { // draw a snapshot point if it's needed let (coord_x, coord_y) = self.seg_coordinates[seg_id]; + + let (_, kind) = &self.seg_to_branch[seg_id]; + if kind == &SvgBranchKind::Lease { + let (x1, y1) = (coord_x, coord_y - 10.0); + let (x2, y2) = (coord_x, coord_y + 10.0); + + let style = "stroke-width=\"3\" stroke=\"blue\""; + + writeln!( + result, + "", + )?; + writeln!(result, " leased lsn at {}", seg.lsn)?; + writeln!(result, "")?; + } + if self.sizes[seg_id].method == SegmentMethod::SnapshotHere { writeln!( result, diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index a6a081c5c144..261ca2cc1ac0 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -40,6 +40,7 @@ thiserror.workspace = true tokio.workspace = true tokio-tar.workspace = true tokio-util.workspace = true +toml_edit.workspace = true tracing.workspace = true tracing-error.workspace = true tracing-subscriber = { workspace = true, features = ["json", "registry"] } diff --git a/libs/utils/src/http/request.rs b/libs/utils/src/http/request.rs index 766bbfc9dfae..8b8ed5a67f39 100644 --- a/libs/utils/src/http/request.rs +++ b/libs/utils/src/http/request.rs @@ -74,6 +74,15 @@ pub fn parse_query_param>( .transpose() } +pub fn must_parse_query_param>( + request: &Request, + param_name: &str, +) -> Result { + parse_query_param(request, param_name)?.ok_or_else(|| { + ApiError::BadRequest(anyhow!("no {param_name} specified in query parameters")) + }) +} + pub async fn ensure_no_body(request: &mut Request) -> Result<(), ApiError> { match request.body_mut().data().await { Some(_) => Err(ApiError::BadRequest(anyhow!("Unexpected request body"))), diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 2953f0aad4fd..711e617801ea 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -26,6 +26,8 @@ pub mod auth; // utility functions and helper traits for unified unique id generation/serialization etc. pub mod id; +pub mod shard; + mod hex; pub use hex::Hex; @@ -94,6 +96,8 @@ pub mod env; pub mod poison; +pub mod toml_edit_ext; + /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages /// /// we have several cases: diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs new file mode 100644 index 000000000000..4f9ac6bdb49a --- /dev/null +++ b/libs/utils/src/shard.rs @@ -0,0 +1,451 @@ +//! See `pageserver_api::shard` for description on sharding. + +use std::{ops::RangeInclusive, str::FromStr}; + +use hex::FromHex; +use serde::{Deserialize, Serialize}; + +use crate::id::TenantId; + +#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] +pub struct ShardNumber(pub u8); + +#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)] +pub struct ShardCount(pub u8); + +/// Combination of ShardNumber and ShardCount. 
For use within the context of a particular tenant, +/// when we need to know which shard we're dealing with, but do not need to know the full +/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know +/// the fully qualified TenantShardId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct ShardIndex { + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + +/// Formatting helper, for generating the `shard_id` label in traces. +pub struct ShardSlug<'a>(&'a TenantShardId); + +/// TenantShardId globally identifies a particular shard in a particular tenant. +/// +/// These are written as `-`, for example: +/// # The second shard in a two-shard tenant +/// 072f1291a5310026820b2fe4b2968934-0102 +/// +/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without +/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables +/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`]. +/// +/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs, +/// is both forward and backward compatible with TenantId: a legacy TenantId can be +/// decoded as a TenantShardId, and when re-encoded it will be parseable +/// as a TenantId. +#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)] +pub struct TenantShardId { + pub tenant_id: TenantId, + pub shard_number: ShardNumber, + pub shard_count: ShardCount, +} + +impl ShardCount { + pub const MAX: Self = Self(u8::MAX); + + /// The internal value of a ShardCount may be zero, which means "1 shard, but use + /// legacy format for TenantShardId that excludes the shard suffix", also known + /// as [`TenantShardId::unsharded`]. + /// + /// This method returns the actual number of shards, i.e. if our internal value is + /// zero, we return 1 (unsharded tenants have 1 shard). + pub fn count(&self) -> u8 { + if self.0 > 0 { + self.0 + } else { + 1 + } + } + + /// The literal internal value: this is **not** the number of shards in the + /// tenant, as we have a special zero value for legacy unsharded tenants. Use + /// [`Self::count`] if you want to know the cardinality of shards. + pub fn literal(&self) -> u8 { + self.0 + } + + /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but + /// uses the legacy format for `TenantShardId`. See also the documentation for + /// [`Self::count`]. + pub fn is_unsharded(&self) -> bool { + self.0 == 0 + } + + /// `v` may be zero, or the number of shards in the tenant. `v` is what + /// [`Self::literal`] would return. + pub const fn new(val: u8) -> Self { + Self(val) + } +} + +impl ShardNumber { + pub const MAX: Self = Self(u8::MAX); +} + +impl TenantShardId { + pub fn unsharded(tenant_id: TenantId) -> Self { + Self { + tenant_id, + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + } + } + + /// The range of all TenantShardId that belong to a particular TenantId. This is useful when + /// you have a BTreeMap of TenantShardId, and are querying by TenantId. + pub fn tenant_range(tenant_id: TenantId) -> RangeInclusive { + RangeInclusive::new( + Self { + tenant_id, + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + }, + Self { + tenant_id, + shard_number: ShardNumber::MAX, + shard_count: ShardCount::MAX, + }, + ) + } + + pub fn shard_slug(&self) -> impl std::fmt::Display + '_ { + ShardSlug(self) + } + + /// Convenience for code that has special behavior on the 0th shard. 
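// Illustrative sketch (follows directly from the ShardCount helpers above): the
// internal value 0 is the legacy "unsharded" encoding, yet it still represents
// exactly one shard.
//
//     assert_eq!(ShardCount::new(0).count(), 1);
//     assert!(ShardCount::new(0).is_unsharded());
//     assert_eq!(ShardCount::new(4).count(), 4);
//     assert_eq!(ShardCount::new(4).literal(), 4);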
+ pub fn is_shard_zero(&self) -> bool { + self.shard_number == ShardNumber(0) + } + + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. + pub fn is_unsharded(&self) -> bool { + self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded() + } + + /// Convenience for dropping the tenant_id and just getting the ShardIndex: this + /// is useful when logging from code that is already in a span that includes tenant ID, to + /// keep messages reasonably terse. + pub fn to_index(&self) -> ShardIndex { + ShardIndex { + shard_number: self.shard_number, + shard_count: self.shard_count, + } + } + + /// Calculate the children of this TenantShardId when splitting the overall tenant into + /// the given number of shards. + pub fn split(&self, new_shard_count: ShardCount) -> Vec { + let effective_old_shard_count = std::cmp::max(self.shard_count.0, 1); + let mut child_shards = Vec::new(); + for shard_number in 0..ShardNumber(new_shard_count.0).0 { + // Key mapping is based on a round robin mapping of key hash modulo shard count, + // so our child shards are the ones which the same keys would map to. + if shard_number % effective_old_shard_count == self.shard_number.0 { + child_shards.push(TenantShardId { + tenant_id: self.tenant_id, + shard_number: ShardNumber(shard_number), + shard_count: new_shard_count, + }) + } + } + + child_shards + } +} + +impl<'a> std::fmt::Display for ShardSlug<'a> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "{:02x}{:02x}", + self.0.shard_number.0, self.0.shard_count.0 + ) + } +} + +impl std::fmt::Display for TenantShardId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + if self.shard_count != ShardCount(0) { + write!(f, "{}-{}", self.tenant_id, self.shard_slug()) + } else { + // Legacy case (shard_count == 0) -- format as just the tenant id. Note that this + // is distinct from the normal single shard case (shard count == 1). 
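// Illustrative sketch (tenant id value borrowed from the doc comment above): an
// unsharded id renders as a bare TenantId, while shard 0 of a one-shard tenant
// keeps its suffix, and split() hands out children by the same round-robin rule.
//
//     let t = TenantId::from_str("072f1291a5310026820b2fe4b2968934")?;
//     assert_eq!(TenantShardId::unsharded(t).to_string(),
//                "072f1291a5310026820b2fe4b2968934");
//     let single = TenantShardId {
//         tenant_id: t,
//         shard_number: ShardNumber(0),
//         shard_count: ShardCount(1),
//     };
//     assert_eq!(single.to_string(), "072f1291a5310026820b2fe4b2968934-0001");
//
//     // splitting 2 shards into 4: shard 0 owns children 0 and 2
//     let parent = TenantShardId { shard_count: ShardCount(2), ..single };
//     let children: Vec<_> = parent.split(ShardCount(4))
//         .iter().map(|c| c.shard_number).collect();
//     assert_eq!(children, vec![ShardNumber(0), ShardNumber(2)]);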
+ self.tenant_id.fmt(f) + } + } +} + +impl std::fmt::Debug for TenantShardId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // Debug is the same as Display: the compact hex representation + write!(f, "{}", self) + } +} + +impl std::str::FromStr for TenantShardId { + type Err = hex::FromHexError; + + fn from_str(s: &str) -> Result { + // Expect format: 16 byte TenantId, '-', 1 byte shard number, 1 byte shard count + if s.len() == 32 { + // Legacy case: no shard specified + Ok(Self { + tenant_id: TenantId::from_str(s)?, + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + }) + } else if s.len() == 37 { + let bytes = s.as_bytes(); + let tenant_id = TenantId::from_hex(&bytes[0..32])?; + let mut shard_parts: [u8; 2] = [0u8; 2]; + hex::decode_to_slice(&bytes[33..37], &mut shard_parts)?; + Ok(Self { + tenant_id, + shard_number: ShardNumber(shard_parts[0]), + shard_count: ShardCount(shard_parts[1]), + }) + } else { + Err(hex::FromHexError::InvalidStringLength) + } + } +} + +impl From<[u8; 18]> for TenantShardId { + fn from(b: [u8; 18]) -> Self { + let tenant_id_bytes: [u8; 16] = b[0..16].try_into().unwrap(); + + Self { + tenant_id: TenantId::from(tenant_id_bytes), + shard_number: ShardNumber(b[16]), + shard_count: ShardCount(b[17]), + } + } +} + +impl ShardIndex { + pub fn new(number: ShardNumber, count: ShardCount) -> Self { + Self { + shard_number: number, + shard_count: count, + } + } + pub fn unsharded() -> Self { + Self { + shard_number: ShardNumber(0), + shard_count: ShardCount(0), + } + } + + /// The "unsharded" value is distinct from simply having a single shard: it represents + /// a tenant which is not shard-aware at all, and whose storage paths will not include + /// a shard suffix. + pub fn is_unsharded(&self) -> bool { + self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0) + } + + /// For use in constructing remote storage paths: concatenate this with a TenantId + /// to get a fully qualified TenantShardId. + /// + /// Backward compat: this function returns an empty string if Self::is_unsharded, such + /// that the legacy pre-sharding remote key format is preserved. 
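// Illustrative sketch (values follow from the implementation just below): legacy
// unsharded tenants keep their pre-sharding remote keys, everything else gains a
// hex "-{number}{count}" suffix.
//
//     assert_eq!(ShardIndex::unsharded().get_suffix(), "");
//     assert_eq!(ShardIndex::new(ShardNumber(3), ShardCount(8)).get_suffix(), "-0308");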
+ pub fn get_suffix(&self) -> String { + if self.is_unsharded() { + "".to_string() + } else { + format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0) + } + } +} + +impl std::fmt::Display for ShardIndex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0) + } +} + +impl std::fmt::Debug for ShardIndex { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + // Debug is the same as Display: the compact hex representation + write!(f, "{}", self) + } +} + +impl std::str::FromStr for ShardIndex { + type Err = hex::FromHexError; + + fn from_str(s: &str) -> Result { + // Expect format: 1 byte shard number, 1 byte shard count + if s.len() == 4 { + let bytes = s.as_bytes(); + let mut shard_parts: [u8; 2] = [0u8; 2]; + hex::decode_to_slice(bytes, &mut shard_parts)?; + Ok(Self { + shard_number: ShardNumber(shard_parts[0]), + shard_count: ShardCount(shard_parts[1]), + }) + } else { + Err(hex::FromHexError::InvalidStringLength) + } + } +} + +impl From<[u8; 2]> for ShardIndex { + fn from(b: [u8; 2]) -> Self { + Self { + shard_number: ShardNumber(b[0]), + shard_count: ShardCount(b[1]), + } + } +} + +impl Serialize for TenantShardId { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + if serializer.is_human_readable() { + serializer.collect_str(self) + } else { + // Note: while human encoding of [`TenantShardId`] is backward and forward + // compatible, this binary encoding is not. + let mut packed: [u8; 18] = [0; 18]; + packed[0..16].clone_from_slice(&self.tenant_id.as_arr()); + packed[16] = self.shard_number.0; + packed[17] = self.shard_count.0; + + packed.serialize(serializer) + } + } +} + +impl<'de> Deserialize<'de> for TenantShardId { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + struct IdVisitor { + is_human_readable_deserializer: bool, + } + + impl<'de> serde::de::Visitor<'de> for IdVisitor { + type Value = TenantShardId; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + if self.is_human_readable_deserializer { + formatter.write_str("value in form of hex string") + } else { + formatter.write_str("value in form of integer array([u8; 18])") + } + } + + fn visit_seq(self, seq: A) -> Result + where + A: serde::de::SeqAccess<'de>, + { + let s = serde::de::value::SeqAccessDeserializer::new(seq); + let id: [u8; 18] = Deserialize::deserialize(s)?; + Ok(TenantShardId::from(id)) + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + TenantShardId::from_str(v).map_err(E::custom) + } + } + + if deserializer.is_human_readable() { + deserializer.deserialize_str(IdVisitor { + is_human_readable_deserializer: true, + }) + } else { + deserializer.deserialize_tuple( + 18, + IdVisitor { + is_human_readable_deserializer: false, + }, + ) + } + } +} + +impl Serialize for ShardIndex { + fn serialize(&self, serializer: S) -> Result + where + S: serde::Serializer, + { + if serializer.is_human_readable() { + serializer.collect_str(self) + } else { + // Binary encoding is not used in index_part.json, but is included in anticipation of + // switching various structures (e.g. inter-process communication, remote metadata) to more + // compact binary encodings in future. 
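// Illustrative sketch (serde_json/bincode are only examples of human-readable vs.
// binary serializers; neither is used by this module itself):
//
//     let idx = ShardIndex::new(ShardNumber(1), ShardCount(2));
//     assert_eq!(serde_json::to_string(&idx)?, "\"0102\"");  // 4-char hex string
//     assert_eq!("0102".parse::<ShardIndex>()?, idx);        // and back again
//     // under bincode the same value is just the two raw bytes [0x01, 0x02]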
+ let mut packed: [u8; 2] = [0; 2]; + packed[0] = self.shard_number.0; + packed[1] = self.shard_count.0; + packed.serialize(serializer) + } + } +} + +impl<'de> Deserialize<'de> for ShardIndex { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + struct IdVisitor { + is_human_readable_deserializer: bool, + } + + impl<'de> serde::de::Visitor<'de> for IdVisitor { + type Value = ShardIndex; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + if self.is_human_readable_deserializer { + formatter.write_str("value in form of hex string") + } else { + formatter.write_str("value in form of integer array([u8; 2])") + } + } + + fn visit_seq(self, seq: A) -> Result + where + A: serde::de::SeqAccess<'de>, + { + let s = serde::de::value::SeqAccessDeserializer::new(seq); + let id: [u8; 2] = Deserialize::deserialize(s)?; + Ok(ShardIndex::from(id)) + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + ShardIndex::from_str(v).map_err(E::custom) + } + } + + if deserializer.is_human_readable() { + deserializer.deserialize_str(IdVisitor { + is_human_readable_deserializer: true, + }) + } else { + deserializer.deserialize_tuple( + 2, + IdVisitor { + is_human_readable_deserializer: false, + }, + ) + } + } +} diff --git a/libs/utils/src/toml_edit_ext.rs b/libs/utils/src/toml_edit_ext.rs new file mode 100644 index 000000000000..ab5f7bdd95ab --- /dev/null +++ b/libs/utils/src/toml_edit_ext.rs @@ -0,0 +1,22 @@ +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("item is not a document")] + ItemIsNotADocument, + #[error(transparent)] + Serde(toml_edit::de::Error), +} + +pub fn deserialize_item(item: &toml_edit::Item) -> Result +where + T: serde::de::DeserializeOwned, +{ + let document: toml_edit::Document = match item { + toml_edit::Item::Table(toml) => toml.clone().into(), + toml_edit::Item::Value(toml_edit::Value::InlineTable(toml)) => { + toml.clone().into_table().into() + } + _ => return Err(Error::ItemIsNotADocument), + }; + + toml_edit::de::from_document(document).map_err(Error::Serde) +} diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 4335f38f1e7f..0d9343d64382 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -62,6 +62,7 @@ sync_wrapper.workspace = true sysinfo.workspace = true tokio-tar.workspace = true thiserror.workspace = true +tikv-jemallocator.workspace = true tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time"] } tokio-epoll-uring.workspace = true tokio-io-timeout.workspace = true diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 5aab10e5d9c0..edc09d0bf22a 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -48,6 +48,7 @@ //! medium/128 time: [8.8311 ms 8.9849 ms 9.1263 ms] //! 
``` +use anyhow::Context; use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager}; @@ -188,6 +189,7 @@ impl Request { manager .request_redo(*key, *lsn, base_img.clone(), records.clone(), *pg_version) .await + .context("request_redo") } fn pg_record(will_init: bool, bytes: &'static [u8]) -> NeonWalRecord { diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index 0ed27602cd3c..a938367334fa 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -8,7 +8,7 @@ license.workspace = true pageserver_api.workspace = true thiserror.workspace = true async-trait.workspace = true -reqwest.workspace = true +reqwest = { workspace = true, features = [ "stream" ] } utils.workspace = true serde.workspace = true workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 69b86d9c466a..e3ddb446fa2c 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -9,6 +9,8 @@ use utils::{ lsn::Lsn, }; +pub use reqwest::Body as ReqwestBody; + pub mod util; #[derive(Debug, Clone)] @@ -20,6 +22,9 @@ pub struct Client { #[derive(thiserror::Error, Debug)] pub enum Error { + #[error("send request: {0}")] + SendRequest(reqwest::Error), + #[error("receive body: {0}")] ReceiveBody(reqwest::Error), @@ -173,19 +178,30 @@ impl Client { self.request(Method::GET, uri, ()).await } - async fn request_noerror( + fn start_request( &self, method: Method, uri: U, - body: B, - ) -> Result { + ) -> reqwest::RequestBuilder { let req = self.client.request(method, uri); - let req = if let Some(value) = &self.authorization_header { + if let Some(value) = &self.authorization_header { req.header(reqwest::header::AUTHORIZATION, value) } else { req - }; - req.json(&body).send().await.map_err(Error::ReceiveBody) + } + } + + async fn request_noerror( + &self, + method: Method, + uri: U, + body: B, + ) -> Result { + self.start_request(method, uri) + .json(&body) + .send() + .await + .map_err(Error::ReceiveBody) } async fn request( @@ -205,15 +221,6 @@ impl Client { Ok(()) } - pub async fn tenant_create(&self, req: &TenantCreateRequest) -> Result { - let uri = format!("{}/v1/tenant", self.mgmt_api_endpoint); - self.request(Method::POST, &uri, req) - .await? - .json() - .await - .map_err(Error::ReceiveBody) - } - /// The tenant deletion API can return 202 if deletion is incomplete, or /// 404 if it is complete. Callers are responsible for checking the status /// code and retrying. Error codes other than 404 will return Err(). @@ -618,4 +625,53 @@ impl Client { }), } } + + pub async fn import_basebackup( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + base_lsn: Lsn, + end_lsn: Lsn, + pg_version: u32, + basebackup_tarball: ReqwestBody, + ) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_basebackup?base_lsn={base_lsn}&end_lsn={end_lsn}&pg_version={pg_version}", + self.mgmt_api_endpoint, + ); + self.start_request(Method::PUT, uri) + .body(basebackup_tarball) + .send() + .await + .map_err(Error::SendRequest)? + .error_from_body() + .await? 
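// Illustrative sketch of a caller (hypothetical; the file path and identifiers
// are made up). The reqwest "stream" feature enabled in this diff lets the
// basebackup tarball be streamed from disk rather than buffered in memory:
//
//     let file = tokio::fs::File::open("/tmp/base.tar").await?;
//     let body = ReqwestBody::wrap_stream(tokio_util::io::ReaderStream::new(file));
//     client
//         .import_basebackup(tenant_id, timeline_id, base_lsn, end_lsn, pg_version, body)
//         .await?;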
+ .json() + .await + .map_err(Error::ReceiveBody) + } + + pub async fn import_wal( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + start_lsn: Lsn, + end_lsn: Lsn, + wal_tarball: ReqwestBody, + ) -> Result<()> { + let uri = format!( + "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}/import_wal?start_lsn={start_lsn}&end_lsn={end_lsn}", + self.mgmt_api_endpoint, + ); + self.start_request(Method::PUT, uri) + .body(wal_tarball) + .send() + .await + .map_err(Error::SendRequest)? + .error_from_body() + .await? + .json() + .await + .map_err(Error::ReceiveBody) + } } diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index 50c3ac4c6143..ea09a011e5cf 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -178,7 +178,7 @@ async fn main() -> anyhow::Result<()> { let toml_item = toml_document .get("remote_storage") .expect("need remote_storage"); - let config = RemoteStorageConfig::from_toml(toml_item)?.expect("incomplete config"); + let config = RemoteStorageConfig::from_toml(toml_item)?; let storage = remote_storage::GenericRemoteStorage::from_config(&config); let cancel = CancellationToken::new(); storage diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 0f057a43683c..207f781e1b27 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -348,35 +348,36 @@ where self.add_rel(rel, rel).await?; } } + } - for (path, content) in self - .timeline - .list_aux_files(self.lsn, self.ctx) - .await - .map_err(|e| BasebackupError::Server(e.into()))? - { - if path.starts_with("pg_replslot") { - let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; - let restart_lsn = Lsn(u64::from_le_bytes( - content[offs..offs + 8].try_into().unwrap(), - )); - info!("Replication slot {} restart LSN={}", path, restart_lsn); - min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn); - } else if path == "pg_logical/replorigin_checkpoint" { - // replorigin_checkoint is written only on compute shutdown, so it contains - // deteriorated values. So we generate our own version of this file for the particular LSN - // based on information about replorigins extracted from transaction commit records. - // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all, - // but now we should handle (skip) it for backward compatibility. - continue; - } - let header = new_tar_header(&path, content.len() as u64)?; - self.ar - .append(&header, &*content) - .await - .context("could not add aux file to basebackup tarball")?; + for (path, content) in self + .timeline + .list_aux_files(self.lsn, self.ctx) + .await + .map_err(|e| BasebackupError::Server(e.into()))? + { + if path.starts_with("pg_replslot") { + let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; + let restart_lsn = Lsn(u64::from_le_bytes( + content[offs..offs + 8].try_into().unwrap(), + )); + info!("Replication slot {} restart LSN={}", path, restart_lsn); + min_restart_lsn = Lsn::min(min_restart_lsn, restart_lsn); + } else if path == "pg_logical/replorigin_checkpoint" { + // replorigin_checkoint is written only on compute shutdown, so it contains + // deteriorated values. So we generate our own version of this file for the particular LSN + // based on information about replorigins extracted from transaction commit records. + // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all, + // but now we should handle (skip) it for backward compatibility. 
+ continue; } + let header = new_tar_header(&path, content.len() as u64)?; + self.ar + .append(&header, &*content) + .await + .context("could not add aux file to basebackup tarball")?; } + if min_restart_lsn != Lsn::MAX { info!( "Min restart LSN for logical replication is {}", diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index ba5b2608bdf1..9f705f0bc923 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -47,6 +47,9 @@ use utils::{ project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + const PID_FILE_NAME: &str = "pageserver.pid"; const FEATURES: &[&str] = &[ @@ -421,6 +424,10 @@ fn start_pageserver( background_jobs_can_start: background_jobs_barrier.clone(), }; + info!(config=?conf.l0_flush, "using l0_flush config"); + let l0_flush_global_state = + pageserver::l0_flush::L0FlushGlobalState::new(conf.l0_flush.clone()); + // Scan the local 'tenants/' directory and start loading the tenants let deletion_queue_client = deletion_queue.new_client(); let tenant_manager = BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr( @@ -429,6 +436,7 @@ fn start_pageserver( broker_client: broker_client.clone(), remote_storage: remote_storage.clone(), deletion_queue_client, + l0_flush_global_state, }, order, shutdown_pageserver.clone(), @@ -652,7 +660,6 @@ fn start_pageserver( async move { page_service::libpq_listener_main( tenant_manager, - broker_client, pg_auth, pageserver_listener, conf.pg_auth_type, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index f36e63f035c7..17bc427b2cf1 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -5,7 +5,7 @@ //! See also `settings.md` for better description on every parameter. 
use anyhow::{anyhow, bail, ensure, Context, Result}; -use pageserver_api::shard::TenantShardId; +use pageserver_api::{models::ImageCompressionAlgorithm, shard::TenantShardId}; use remote_storage::{RemotePath, RemoteStorageConfig}; use serde; use serde::de::IntoDeserializer; @@ -30,11 +30,11 @@ use utils::{ logging::LogFormat, }; -use crate::tenant::timeline::GetVectoredImpl; use crate::tenant::vectored_blob_io::MaxVectoredReadBytes; use crate::tenant::{config::TenantConfOpt, timeline::GetImpl}; use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use crate::{disk_usage_eviction_task::DiskUsageEvictionTaskConfig, virtual_file::io_engine}; +use crate::{l0_flush::L0FlushConfig, tenant::timeline::GetVectoredImpl}; use crate::{tenant::config::TenantConf, virtual_file}; use crate::{TENANT_HEATMAP_BASENAME, TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX}; @@ -50,6 +50,7 @@ pub mod defaults { DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_HTTP_LISTEN_PORT, DEFAULT_PG_LISTEN_ADDR, DEFAULT_PG_LISTEN_PORT, }; + use pageserver_api::models::ImageCompressionAlgorithm; pub use storage_broker::DEFAULT_ENDPOINT as BROKER_DEFAULT_ENDPOINT; pub const DEFAULT_WAIT_LSN_TIMEOUT: &str = "60 s"; @@ -90,6 +91,9 @@ pub mod defaults { pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB + pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm = + ImageCompressionAlgorithm::Disabled; + pub const DEFAULT_VALIDATE_VECTORED_GET: bool = true; pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0; @@ -159,7 +163,7 @@ pub mod defaults { #ephemeral_bytes_per_memory_kb = {DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB} -[remote_storage] +#[remote_storage] "# ); @@ -285,12 +289,16 @@ pub struct PageServerConf { pub validate_vectored_get: bool, + pub image_compression: ImageCompressionAlgorithm, + /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this /// is exceeded, we start proactively closing ephemeral layers to limit the total amount /// of ephemeral data. /// /// Setting this to zero disables limits on total ephemeral layer size. 
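    /// Worked example of the ratio documented above (numbers are made up): on a
    /// node with 16 GiB of RAM, `ephemeral_bytes_per_memory_kb = 512` caps
    /// ephemeral layer content at 16 * 1024 * 1024 KiB * 512 B/KiB = 8 GiB,
    /// while the default of 0 leaves the total size unlimited.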
pub ephemeral_bytes_per_memory_kb: usize, + + pub l0_flush: L0FlushConfig, } /// We do not want to store this in a PageServerConf because the latter may be logged @@ -395,7 +403,11 @@ struct PageServerConfigBuilder { validate_vectored_get: BuilderValue, + image_compression: BuilderValue, + ephemeral_bytes_per_memory_kb: BuilderValue, + + l0_flush: BuilderValue, } impl PageServerConfigBuilder { @@ -482,8 +494,10 @@ impl PageServerConfigBuilder { max_vectored_read_bytes: Set(MaxVectoredReadBytes( NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), + image_compression: Set(DEFAULT_IMAGE_COMPRESSION), validate_vectored_get: Set(DEFAULT_VALIDATE_VECTORED_GET), ephemeral_bytes_per_memory_kb: Set(DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), + l0_flush: Set(L0FlushConfig::default()), } } } @@ -667,10 +681,18 @@ impl PageServerConfigBuilder { self.validate_vectored_get = BuilderValue::Set(value); } + pub fn get_image_compression(&mut self, value: ImageCompressionAlgorithm) { + self.image_compression = BuilderValue::Set(value); + } + pub fn get_ephemeral_bytes_per_memory_kb(&mut self, value: usize) { self.ephemeral_bytes_per_memory_kb = BuilderValue::Set(value); } + pub fn l0_flush(&mut self, value: L0FlushConfig) { + self.l0_flush = BuilderValue::Set(value); + } + pub fn build(self) -> anyhow::Result { let default = Self::default_values(); @@ -727,7 +749,9 @@ impl PageServerConfigBuilder { get_impl, max_vectored_read_bytes, validate_vectored_get, + image_compression, ephemeral_bytes_per_memory_kb, + l0_flush, } CUSTOM LOGIC { @@ -918,7 +942,7 @@ impl PageServerConf { "http_auth_type" => builder.http_auth_type(parse_toml_from_str(key, item)?), "pg_auth_type" => builder.pg_auth_type(parse_toml_from_str(key, item)?), "remote_storage" => { - builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?) + builder.remote_storage_config(Some(RemoteStorageConfig::from_toml(item).context("remote_storage")?)) } "tenant_config" => { t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?; @@ -946,7 +970,7 @@ impl PageServerConf { builder.metric_collection_endpoint(Some(endpoint)); }, "metric_collection_bucket" => { - builder.metric_collection_bucket(RemoteStorageConfig::from_toml(item)?) + builder.metric_collection_bucket(Some(RemoteStorageConfig::from_toml(item)?)) } "synthetic_size_calculation_interval" => builder.synthetic_size_calculation_interval(parse_toml_duration(key, item)?), @@ -1004,9 +1028,15 @@ impl PageServerConf { "validate_vectored_get" => { builder.get_validate_vectored_get(parse_toml_bool("validate_vectored_get", item)?) } + "image_compression" => { + builder.get_image_compression(parse_toml_from_str("image_compression", item)?) + } "ephemeral_bytes_per_memory_kb" => { builder.get_ephemeral_bytes_per_memory_kb(parse_toml_u64("ephemeral_bytes_per_memory_kb", item)? as usize) } + "l0_flush" => { + builder.l0_flush(utils::toml_edit_ext::deserialize_item(item).context("l0_flush")?) 
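// Illustrative sketch (spelling follows the serde attributes on L0FlushConfig,
// which is added later in this diff: tag = "mode", kebab-case variants,
// snake_case fields). In pageserver.toml the new knob reads either
//
//     l0_flush = { mode = "page-cached" }
// or
//     l0_flush = { mode = "direct", max_concurrency = 4 }
//
// and leaving it out falls back to L0FlushConfig::default(), i.e. page-cached.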
+ } _ => bail!("unrecognized pageserver option '{key}'"), } } @@ -1088,8 +1118,10 @@ impl PageServerConf { NonZeroUsize::new(defaults::DEFAULT_MAX_VECTORED_READ_BYTES) .expect("Invalid default constant"), ), + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + l0_flush: L0FlushConfig::default(), } } } @@ -1328,7 +1360,9 @@ background_task_maximum_delay = '334 s' .expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + l0_flush: L0FlushConfig::default(), }, "Correct defaults should be used when no config values are provided" ); @@ -1401,7 +1435,9 @@ background_task_maximum_delay = '334 s' .expect("Invalid default constant") ), validate_vectored_get: defaults::DEFAULT_VALIDATE_VECTORED_GET, + image_compression: defaults::DEFAULT_IMAGE_COMPRESSION, ephemeral_bytes_per_memory_kb: defaults::DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB, + l0_flush: L0FlushConfig::default(), }, "Should be able to parse all basic config values correctly" ); @@ -1681,6 +1717,19 @@ threshold = "20m" } } + #[test] + fn empty_remote_storage_is_error() { + let tempdir = tempdir().unwrap(); + let (workdir, _) = prepare_fs(&tempdir).unwrap(); + let input = r#" +remote_storage = {} + "#; + let doc = toml_edit::Document::from_str(input).unwrap(); + let err = PageServerConf::parse_and_validate(&doc, &workdir) + .expect_err("empty remote_storage field should fail, don't specify it if you want no remote_storage"); + assert!(format!("{err}").contains("remote_storage"), "{err}"); + } + fn prepare_fs(tempdir: &Utf8TempDir) -> anyhow::Result<(Utf8PathBuf, Utf8PathBuf)> { let tempdir_path = tempdir.path(); diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs index bf06c78e673f..d215fd2b7d2d 100644 --- a/pageserver/src/deletion_queue/validator.rs +++ b/pageserver/src/deletion_queue/validator.rs @@ -190,7 +190,7 @@ where } } else { // If we failed validation, then do not apply any of the projected updates - warn!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation); + info!("Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}", tenant_lsn_state.generation); metrics::DELETION_QUEUE.dropped_lsn_updates.inc(); } } @@ -225,7 +225,7 @@ where && (tenant.generation == *validated_generation); if !this_list_valid { - warn!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation); + info!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation); metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64); mutated = true; } else { diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 58ff6e3f83cc..5ba329f05ece 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -265,15 +265,19 @@ paths: type: string format: hex post: - description: Obtain lease for the given LSN - parameters: - - name: lsn - in: query - required: true - schema: - type: string - format: hex - description: A LSN to obtain the lease for + description: Obtains a lease for the given LSN. 
+ requestBody: + content: + application/json: + schema: + type: object + required: + - lsn + properties: + lsn: + description: A LSN to obtain the lease for. + type: string + format: hex responses: "200": description: OK diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 5ebd34a40690..6f8f3e6389d5 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -10,6 +10,7 @@ use std::time::Duration; use anyhow::{anyhow, Context, Result}; use enumset::EnumSet; +use futures::StreamExt; use futures::TryFutureExt; use humantime::format_rfc3339; use hyper::header; @@ -22,6 +23,7 @@ use pageserver_api::models::ListAuxFilesRequest; use pageserver_api::models::LocationConfig; use pageserver_api::models::LocationConfigListResponse; use pageserver_api::models::LsnLease; +use pageserver_api::models::LsnLeaseRequest; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; use pageserver_api::models::TenantLocationConfigResponse; @@ -42,18 +44,19 @@ use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use remote_storage::TimeTravelError; -use tenant_size_model::{SizeResult, StorageModel}; +use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel}; +use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; use utils::failpoint_support::failpoints_handler; use utils::http::endpoint::prometheus_metrics_handler; use utils::http::endpoint::request_span; +use utils::http::request::must_parse_query_param; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; use crate::context::{DownloadBehavior, RequestContext}; use crate::deletion_queue::DeletionQueueClient; -use crate::metrics::{StorageTimeOperation, STORAGE_TIME_GLOBAL}; use crate::pgdatadir_mapping::LsnForTimestamp; use crate::task_mgr::TaskKind; use crate::tenant::config::{LocationConf, TenantConfOpt}; @@ -75,13 +78,12 @@ use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; use crate::tenant::GetTimelineError; -use crate::tenant::SpawnMode; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::{config::PageServerConf, tenant::mgr}; use crate::{disk_usage_eviction_task, tenant}; use pageserver_api::models::{ - StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo, - TimelineCreateRequest, TimelineGcRequest, TimelineInfo, + StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, + TimelineInfo, }; use utils::{ auth::SwappableJwtAuth, @@ -229,7 +231,7 @@ impl From for ApiError { BadRequest(e) => ApiError::BadRequest(e), Unavailable(_) => ApiError::ShuttingDown, e @ InProgress => ApiError::Conflict(format!("{e}")), - Flush(e) | Other(e) => ApiError::InternalServerError(e), + Flush(e) | InternalError(e) => ApiError::InternalServerError(e), } } } @@ -408,6 +410,8 @@ async fn build_timeline_info_common( let walreceiver_status = timeline.walreceiver_status(); + let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats(); + let info = TimelineInfo { tenant_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, @@ -428,6 +432,8 @@ async fn build_timeline_info_common( directory_entries_counts: timeline.get_directory_metrics().to_vec(), current_physical_size, current_logical_size_non_incremental: None, + 
pitr_history_size, + within_ancestor_pitr, timeline_dir_layer_file_size_sum: None, wal_source_connstr, last_received_msg_lsn, @@ -1193,10 +1199,15 @@ fn synthetic_size_html_response( timeline_map.insert(ti.timeline_id, index); timeline_ids.push(ti.timeline_id.to_string()); } - let seg_to_branch: Vec = inputs + let seg_to_branch: Vec<(usize, SvgBranchKind)> = inputs .segments .iter() - .map(|seg| *timeline_map.get(&seg.timeline_id).unwrap()) + .map(|seg| { + ( + *timeline_map.get(&seg.timeline_id).unwrap(), + seg.kind.into(), + ) + }) .collect(); let svg = @@ -1237,75 +1248,6 @@ pub fn html_response(status: StatusCode, data: String) -> Result, Ok(response) } -/// Helper for requests that may take a generation, which is mandatory -/// when control_plane_api is set, but otherwise defaults to Generation::none() -fn get_request_generation(state: &State, req_gen: Option) -> Result { - if state.conf.control_plane_api.is_some() { - req_gen - .map(Generation::new) - .ok_or(ApiError::BadRequest(anyhow!( - "generation attribute missing" - ))) - } else { - // Legacy mode: all tenants operate with no generation - Ok(Generation::none()) - } -} - -async fn tenant_create_handler( - mut request: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - let request_data: TenantCreateRequest = json_request(&mut request).await?; - let target_tenant_id = request_data.new_tenant_id; - check_permission(&request, None)?; - - let _timer = STORAGE_TIME_GLOBAL - .get_metric_with_label_values(&[StorageTimeOperation::CreateTenant.into()]) - .expect("bug") - .start_timer(); - - let tenant_conf = - TenantConfOpt::try_from(&request_data.config).map_err(ApiError::BadRequest)?; - - let state = get_state(&request); - - let generation = get_request_generation(state, request_data.generation)?; - - let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); - - let location_conf = - LocationConf::attached_single(tenant_conf, generation, &request_data.shard_parameters); - - let new_tenant = state - .tenant_manager - .upsert_location( - target_tenant_id, - location_conf, - None, - SpawnMode::Create, - &ctx, - ) - .await?; - - let Some(new_tenant) = new_tenant else { - // This should never happen: indicates a bug in upsert_location - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Upsert succeeded but didn't return tenant!" - ))); - }; - // We created the tenant. Existing API semantics are that the tenant - // is Active when this function returns. - new_tenant - .wait_to_become_active(ACTIVE_TENANT_TIMEOUT) - .await?; - - json_response( - StatusCode::CREATED, - TenantCreateResponse(new_tenant.tenant_shard_id().tenant_id), - ) -} - async fn get_tenant_config_handler( request: Request, _cancel: CancellationToken, @@ -1367,7 +1309,7 @@ async fn update_tenant_config_handler( crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?; tenant.set_new_tenant_config(new_tenant_conf); json_response(StatusCode::OK, ()) @@ -1598,15 +1540,13 @@ async fn handle_tenant_break( // Obtains an lsn lease on the given timeline. 
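// Illustrative note (request shape follows LsnLeaseRequest and the OpenAPI change
// above; the endpoint path is not shown in this hunk): the target LSN now arrives
// in a JSON body such as {"lsn": "0/15E2C71"} instead of an ?lsn= query parameter.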
async fn lsn_lease_handler( - request: Request, + mut request: Request, _cancel: CancellationToken, ) -> Result, ApiError> { let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; - - let lsn: Lsn = parse_query_param(&request, "lsn")? - .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?; + let lsn = json_request::(&mut request).await?.lsn; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); @@ -2467,6 +2407,189 @@ async fn post_top_tenants( ) } +async fn put_tenant_timeline_import_basebackup( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let base_lsn: Lsn = must_parse_query_param(&request, "base_lsn")?; + let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?; + let pg_version: u32 = must_parse_query_param(&request, "pg_version")?; + + check_permission(&request, Some(tenant_id))?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + + let span = info_span!("import_basebackup", tenant_id=%tenant_id, timeline_id=%timeline_id, base_lsn=%base_lsn, end_lsn=%end_lsn, pg_version=%pg_version); + async move { + let state = get_state(&request); + let tenant = state + .tenant_manager + .get_attached_tenant_shard(TenantShardId::unsharded(tenant_id))?; + + let broker_client = state.broker_client.clone(); + + let mut body = StreamReader::new(request.into_body().map(|res| { + res.map_err(|error| { + std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error)) + }) + })); + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let timeline = tenant + .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx) + .map_err(ApiError::InternalServerError) + .await?; + + // TODO mark timeline as not ready until it reaches end_lsn. + // We might have some wal to import as well, and we should prevent compute + // from connecting before that and writing conflicting wal. + // + // This is not relevant for pageserver->pageserver migrations, since there's + // no wal to import. But should be fixed if we want to import from postgres. + + // TODO leave clean state on error. For now you can use detach to clean + // up broken state from a failed import. + + // Import basebackup provided via CopyData + info!("importing basebackup"); + + timeline + .import_basebackup_from_tar(tenant.clone(), &mut body, base_lsn, broker_client, &ctx) + .await + .map_err(ApiError::InternalServerError)?; + + // Read the end of the tar archive. + read_tar_eof(body) + .await + .map_err(ApiError::InternalServerError)?; + + // TODO check checksum + // Meanwhile you can verify client-side by taking fullbackup + // and checking that it matches in size with what was imported. + // It wouldn't work if base came from vanilla postgres though, + // since we discard some log files. 
+ + info!("done"); + json_response(StatusCode::OK, ()) + } + .instrument(span) + .await +} + +async fn put_tenant_timeline_import_wal( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let start_lsn: Lsn = must_parse_query_param(&request, "start_lsn")?; + let end_lsn: Lsn = must_parse_query_param(&request, "end_lsn")?; + + check_permission(&request, Some(tenant_id))?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + + let span = info_span!("import_wal", tenant_id=%tenant_id, timeline_id=%timeline_id, start_lsn=%start_lsn, end_lsn=%end_lsn); + async move { + let state = get_state(&request); + + let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?; + + let mut body = StreamReader::new(request.into_body().map(|res| { + res.map_err(|error| { + std::io::Error::new(std::io::ErrorKind::Other, anyhow::anyhow!(error)) + }) + })); + + let last_record_lsn = timeline.get_last_record_lsn(); + if last_record_lsn != start_lsn { + return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))); + } + + // TODO leave clean state on error. For now you can use detach to clean + // up broken state from a failed import. + + // Import wal provided via CopyData + info!("importing wal"); + crate::import_datadir::import_wal_from_tar(&timeline, &mut body, start_lsn, end_lsn, &ctx).await.map_err(ApiError::InternalServerError)?; + info!("wal import complete"); + + // Read the end of the tar archive. + read_tar_eof(body).await.map_err(ApiError::InternalServerError)?; + + // TODO Does it make sense to overshoot? + if timeline.get_last_record_lsn() < end_lsn { + return Err(ApiError::InternalServerError(anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}"))); + } + + // Flush data to disk, then upload to s3. No need for a forced checkpoint. + // We only want to persist the data, and it doesn't matter if it's in the + // shape of deltas or images. + info!("flushing layers"); + timeline.freeze_and_flush().await.map_err(|e| match e { + tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown, + other => ApiError::InternalServerError(anyhow::anyhow!(other)), + })?; + + info!("done"); + + json_response(StatusCode::OK, ()) + }.instrument(span).await +} + +/// Read the end of a tar archive. +/// +/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each. +/// `tokio_tar` already read the first such block. Read the second all-zeros block, +/// and check that there is no more data after the EOF marker. +/// +/// 'tar' command can also write extra blocks of zeros, up to a record +/// size, controlled by the --record-size argument. Ignore them too. 
+async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow::Result<()> { + use tokio::io::AsyncReadExt; + let mut buf = [0u8; 512]; + + // Read the all-zeros block, and verify it + let mut total_bytes = 0; + while total_bytes < 512 { + let nbytes = reader.read(&mut buf[total_bytes..]).await?; + total_bytes += nbytes; + if nbytes == 0 { + break; + } + } + if total_bytes < 512 { + anyhow::bail!("incomplete or invalid tar EOF marker"); + } + if !buf.iter().all(|&x| x == 0) { + anyhow::bail!("invalid tar EOF marker"); + } + + // Drain any extra zero-blocks after the EOF marker + let mut trailing_bytes = 0; + let mut seen_nonzero_bytes = false; + loop { + let nbytes = reader.read(&mut buf).await?; + trailing_bytes += nbytes; + if !buf.iter().all(|&x| x == 0) { + seen_nonzero_bytes = true; + } + if nbytes == 0 { + break; + } + } + if seen_nonzero_bytes { + anyhow::bail!("unexpected non-zero bytes after the tar archive"); + } + if trailing_bytes % 512 != 0 { + anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive"); + } + Ok(()) +} + /// Common functionality of all the HTTP API handlers. /// /// - Adds a tracing span to each request (by `request_span`) @@ -2611,7 +2734,6 @@ pub fn make_router( api_handler(r, reload_auth_validation_keys_handler) }) .get("/v1/tenant", |r| api_handler(r, tenant_list_handler)) - .post("/v1/tenant", |r| api_handler(r, tenant_create_handler)) .get("/v1/tenant/:tenant_shard_id", |r| { api_handler(r, tenant_status) }) @@ -2762,5 +2884,13 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/perf_info", |r| testing_api_handler("perf_info", r, perf_info), ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/import_basebackup", + |r| api_handler(r, put_tenant_timeline_import_basebackup), + ) + .put( + "/v1/tenant/:tenant_id/timeline/:timeline_id/import_wal", + |r| api_handler(r, put_tenant_timeline_import_wal), + ) .any(handler_404)) } diff --git a/pageserver/src/l0_flush.rs b/pageserver/src/l0_flush.rs new file mode 100644 index 000000000000..7fe8fedc6394 --- /dev/null +++ b/pageserver/src/l0_flush.rs @@ -0,0 +1,46 @@ +use std::{num::NonZeroUsize, sync::Arc}; + +use crate::tenant::ephemeral_file; + +#[derive(Default, Debug, PartialEq, Eq, Clone, serde::Deserialize)] +#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] +pub enum L0FlushConfig { + #[default] + PageCached, + #[serde(rename_all = "snake_case")] + Direct { max_concurrency: NonZeroUsize }, +} + +#[derive(Clone)] +pub struct L0FlushGlobalState(Arc); + +pub(crate) enum Inner { + PageCached, + Direct { semaphore: tokio::sync::Semaphore }, +} + +impl L0FlushGlobalState { + pub fn new(config: L0FlushConfig) -> Self { + match config { + L0FlushConfig::PageCached => Self(Arc::new(Inner::PageCached)), + L0FlushConfig::Direct { max_concurrency } => { + let semaphore = tokio::sync::Semaphore::new(max_concurrency.get()); + Self(Arc::new(Inner::Direct { semaphore })) + } + } + } + + pub(crate) fn inner(&self) -> &Arc { + &self.0 + } +} + +impl L0FlushConfig { + pub(crate) fn prewarm_on_write(&self) -> ephemeral_file::PrewarmPageCacheOnWrite { + use L0FlushConfig::*; + match self { + PageCached => ephemeral_file::PrewarmPageCacheOnWrite::Yes, + Direct { .. 
} => ephemeral_file::PrewarmPageCacheOnWrite::No, + } + } +} diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 353f97264c5f..ac6b9b4f2a60 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -11,6 +11,7 @@ pub mod deletion_queue; pub mod disk_usage_eviction_task; pub mod http; pub mod import_datadir; +pub mod l0_flush; pub use pageserver_api::keyspace; pub mod aux_file; pub mod metrics; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index ca697afcf640..e67fa656d02e 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -8,7 +8,7 @@ use metrics::{ }; use once_cell::sync::Lazy; use pageserver_api::shard::TenantShardId; -use strum::{EnumCount, IntoEnumIterator, VariantNames}; +use strum::{EnumCount, VariantNames}; use strum_macros::{EnumVariantNames, IntoStaticStr}; use tracing::warn; use utils::id::TimelineId; @@ -53,9 +53,6 @@ pub(crate) enum StorageTimeOperation { #[strum(serialize = "find gc cutoffs")] FindGcCutoffs, - - #[strum(serialize = "create tenant")] - CreateTenant, } pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy = Lazy::new(|| { @@ -467,6 +464,24 @@ static LAST_RECORD_LSN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static PITR_HISTORY_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_pitr_history_size", + "Data written since PITR cutoff on this timeline", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +static TIMELINE_ARCHIVE_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_archive_size", + "Timeline's logical size if it is considered eligible for archival (outside PITR window), else zero", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + static STANDBY_HORIZON: Lazy = Lazy::new(|| { register_int_gauge_vec!( "pageserver_standby_horizon", @@ -479,7 +494,7 @@ static STANDBY_HORIZON: Lazy = Lazy::new(|| { static RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_resident_physical_size", - "The size of the layer files present in the pageserver's filesystem.", + "The size of the layer files present in the pageserver's filesystem, for attached locations.", &["tenant_id", "shard_id", "timeline_id"] ) .expect("failed to define a metric") @@ -1079,21 +1094,12 @@ pub(crate) mod virtual_file_io_engine { }); } -#[derive(Debug)] -struct GlobalAndPerTimelineHistogram { - global: Histogram, - per_tenant_timeline: Histogram, -} +struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { + global_metric: &'a Histogram, -impl GlobalAndPerTimelineHistogram { - fn observe(&self, value: f64) { - self.global.observe(value); - self.per_tenant_timeline.observe(value); - } -} + // Optional because not all op types are tracked per-timeline + timeline_metric: Option<&'a Histogram>, -struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { - h: &'a GlobalAndPerTimelineHistogram, ctx: &'c RequestContext, start: std::time::Instant, op: SmgrQueryType, @@ -1124,7 +1130,10 @@ impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> { elapsed } }; - self.h.observe(ex_throttled.as_secs_f64()); + self.global_metric.observe(ex_throttled.as_secs_f64()); + if let Some(timeline_metric) = self.timeline_metric { + timeline_metric.observe(ex_throttled.as_secs_f64()); + } } } @@ -1149,7 +1158,8 @@ pub enum SmgrQueryType { #[derive(Debug)] pub(crate) struct SmgrQueryTimePerTimeline { - metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT], + global_metrics: 
[Histogram; SmgrQueryType::COUNT], + per_timeline_getpage: Histogram, } static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { @@ -1227,27 +1237,32 @@ impl SmgrQueryTimePerTimeline { let tenant_id = tenant_shard_id.tenant_id.to_string(); let shard_slug = format!("{}", tenant_shard_id.shard_slug()); let timeline_id = timeline_id.to_string(); - let metrics = std::array::from_fn(|i| { + let global_metrics = std::array::from_fn(|i| { let op = SmgrQueryType::from_repr(i).unwrap(); - let global = SMGR_QUERY_TIME_GLOBAL + SMGR_QUERY_TIME_GLOBAL .get_metric_with_label_values(&[op.into()]) - .unwrap(); - let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE - .get_metric_with_label_values(&[op.into(), &tenant_id, &shard_slug, &timeline_id]) - .unwrap(); - GlobalAndPerTimelineHistogram { - global, - per_tenant_timeline, - } + .unwrap() }); - Self { metrics } + + let per_timeline_getpage = SMGR_QUERY_TIME_PER_TENANT_TIMELINE + .get_metric_with_label_values(&[ + SmgrQueryType::GetPageAtLsn.into(), + &tenant_id, + &shard_slug, + &timeline_id, + ]) + .unwrap(); + Self { + global_metrics, + per_timeline_getpage, + } } pub(crate) fn start_timer<'c: 'a, 'a>( &'a self, op: SmgrQueryType, ctx: &'c RequestContext, - ) -> impl Drop + '_ { - let metric = &self.metrics[op as usize]; + ) -> Option { + let global_metric = &self.global_metrics[op as usize]; let start = Instant::now(); match ctx.micros_spent_throttled.open() { Ok(()) => (), @@ -1266,12 +1281,20 @@ impl SmgrQueryTimePerTimeline { }); } } - GlobalAndPerTimelineHistogramTimer { - h: metric, + + let timeline_metric = if matches!(op, SmgrQueryType::GetPageAtLsn) { + Some(&self.per_timeline_getpage) + } else { + None + }; + + Some(GlobalAndPerTimelineHistogramTimer { + global_metric, + timeline_metric, ctx, start, op, - } + }) } } @@ -1318,17 +1341,9 @@ mod smgr_query_time_tests { let get_counts = || { let global: u64 = ops .iter() - .map(|op| metrics.metrics[*op as usize].global.get_sample_count()) - .sum(); - let per_tenant_timeline: u64 = ops - .iter() - .map(|op| { - metrics.metrics[*op as usize] - .per_tenant_timeline - .get_sample_count() - }) + .map(|op| metrics.global_metrics[*op as usize].get_sample_count()) .sum(); - (global, per_tenant_timeline) + (global, metrics.per_timeline_getpage.get_sample_count()) }; let (pre_global, pre_per_tenant_timeline) = get_counts(); @@ -1339,7 +1354,12 @@ mod smgr_query_time_tests { drop(timer); let (post_global, post_per_tenant_timeline) = get_counts(); - assert_eq!(post_per_tenant_timeline, 1); + if matches!(op, super::SmgrQueryType::GetPageAtLsn) { + // getpage ops are tracked per-timeline, others aren't + assert_eq!(post_per_tenant_timeline, 1); + } else { + assert_eq!(post_per_tenant_timeline, 0); + } assert!(post_global > pre_global); } } @@ -1436,10 +1456,12 @@ impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { } } -pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy = Lazy::new(|| { - register_int_gauge_vec!( - "pageserver_live_connections", - "Number of live network connections", +pub(crate) static LIVE_CONNECTIONS: Lazy = Lazy::new(|| { + register_int_counter_pair_vec!( + "pageserver_live_connections_started", + "Number of network connections that we started handling", + "pageserver_live_connections_finished", + "Number of network connections that we finished handling", &["pageserver_connection_kind"] ) .expect("failed to define a metric") @@ -1450,10 +1472,7 @@ pub(crate) enum ComputeCommandKind { PageStreamV2, PageStream, Basebackup, - GetLastRecordRlsn, Fullbackup, - 
ImportBasebackup, - ImportWal, LeaseLsn, Show, } @@ -1694,6 +1713,15 @@ pub(crate) static SECONDARY_MODE: Lazy = Lazy::new(|| { } }); +pub(crate) static SECONDARY_RESIDENT_PHYSICAL_SIZE: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_secondary_resident_physical_size", + "The size of the layer files present in the pageserver's filesystem, for secondary locations.", + &["tenant_id", "shard_id"] + ) + .expect("failed to define a metric") +}); + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub enum RemoteOpKind { Upload, @@ -2096,6 +2124,8 @@ pub(crate) struct TimelineMetrics { pub garbage_collect_histo: StorageTimeMetrics, pub find_gc_cutoffs_histo: StorageTimeMetrics, pub last_record_gauge: IntGauge, + pub pitr_history_size: UIntGauge, + pub archival_size: UIntGauge, pub standby_horizon_gauge: IntGauge, pub resident_physical_size_gauge: UIntGauge, /// copy of LayeredTimeline.current_logical_size @@ -2169,6 +2199,15 @@ impl TimelineMetrics { let last_record_gauge = LAST_RECORD_LSN .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + + let pitr_history_size = PITR_HISTORY_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + + let archival_size = TIMELINE_ARCHIVE_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + let standby_horizon_gauge = STANDBY_HORIZON .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2221,6 +2260,8 @@ impl TimelineMetrics { find_gc_cutoffs_histo, load_layer_map_histo, last_record_gauge, + pitr_history_size, + archival_size, standby_horizon_gauge, resident_physical_size_gauge, current_logical_size_gauge, @@ -2278,6 +2319,10 @@ impl TimelineMetrics { if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) { let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]); } + + let _ = TIMELINE_ARCHIVE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = PITR_HISTORY_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = EVICTIONS.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = AUX_FILE_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = VALID_LSN_LEASE_COUNT.remove_label_values(&[tenant_id, shard_id, timeline_id]); @@ -2311,14 +2356,12 @@ impl TimelineMetrics { let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]); } - for op in SmgrQueryType::iter() { - let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[ - op.into(), - tenant_id, - shard_id, - timeline_id, - ]); - } + let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[ + SmgrQueryType::GetPageAtLsn.into(), + tenant_id, + shard_id, + timeline_id, + ]); } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 6ea5f396d0a7..c10c2f2a0f9a 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -4,9 +4,7 @@ use anyhow::Context; use async_compression::tokio::write::GzipEncoder; use bytes::Buf; -use bytes::Bytes; use futures::stream::FuturesUnordered; -use futures::Stream; use futures::StreamExt; use pageserver_api::key::Key; use pageserver_api::models::TenantState; @@ -28,7 +26,6 @@ use std::borrow::Cow; use std::collections::HashMap; use std::io; use std::net::TcpListener; -use std::pin::pin; use std::str; use std::str::FromStr; use std::sync::Arc; @@ -37,7 +34,6 @@ use std::time::Instant; use std::time::SystemTime; use tokio::io::AsyncWriteExt; use 
tokio::io::{AsyncRead, AsyncWrite}; -use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::*; use utils::id::ConnectionId; @@ -53,9 +49,8 @@ use crate::auth::check_permission; use crate::basebackup; use crate::basebackup::BasebackupError; use crate::context::{DownloadBehavior, RequestContext}; -use crate::import_datadir::import_wal_from_tar; use crate::metrics; -use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS_COUNT}; +use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS}; use crate::pgdatadir_mapping::Version; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; @@ -66,7 +61,6 @@ use crate::tenant::mgr::GetTenantError; use crate::tenant::mgr::ShardResolveResult; use crate::tenant::mgr::ShardSelector; use crate::tenant::mgr::TenantManager; -use crate::tenant::timeline::FlushLayerError; use crate::tenant::timeline::WaitLsnError; use crate::tenant::GetTimelineError; use crate::tenant::PageReconstructError; @@ -82,56 +76,6 @@ use postgres_ffi::BLCKSZ; // is not yet in state [`TenantState::Active`]. const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000); -/// Read the end of a tar archive. -/// -/// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each. -/// `tokio_tar` already read the first such block. Read the second all-zeros block, -/// and check that there is no more data after the EOF marker. -/// -/// 'tar' command can also write extra blocks of zeros, up to a record -/// size, controlled by the --record-size argument. Ignore them too. -async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<()> { - use tokio::io::AsyncReadExt; - let mut buf = [0u8; 512]; - - // Read the all-zeros block, and verify it - let mut total_bytes = 0; - while total_bytes < 512 { - let nbytes = reader.read(&mut buf[total_bytes..]).await?; - total_bytes += nbytes; - if nbytes == 0 { - break; - } - } - if total_bytes < 512 { - anyhow::bail!("incomplete or invalid tar EOF marker"); - } - if !buf.iter().all(|&x| x == 0) { - anyhow::bail!("invalid tar EOF marker"); - } - - // Drain any extra zero-blocks after the EOF marker - let mut trailing_bytes = 0; - let mut seen_nonzero_bytes = false; - loop { - let nbytes = reader.read(&mut buf).await?; - trailing_bytes += nbytes; - if !buf.iter().all(|&x| x == 0) { - seen_nonzero_bytes = true; - } - if nbytes == 0 { - break; - } - } - if seen_nonzero_bytes { - anyhow::bail!("unexpected non-zero bytes after the tar archive"); - } - if trailing_bytes % 512 != 0 { - anyhow::bail!("unexpected number of zeros ({trailing_bytes}), not divisible by tar block size (512 bytes), after the tar archive"); - } - Ok(()) -} - /////////////////////////////////////////////////////////////////////////////// /// @@ -141,7 +85,6 @@ async fn read_tar_eof(mut reader: (impl AsyncRead + Unpin)) -> anyhow::Result<() /// pub async fn libpq_listener_main( tenant_manager: Arc, - broker_client: storage_broker::BrokerClientChannel, auth: Option>, listener: TcpListener, auth_type: AuthType, @@ -186,7 +129,6 @@ pub async fn libpq_listener_main( false, page_service_conn_main( tenant_manager.clone(), - broker_client.clone(), local_auth, socket, auth_type, @@ -209,20 +151,14 @@ pub async fn libpq_listener_main( #[instrument(skip_all, fields(peer_addr))] async fn page_service_conn_main( tenant_manager: Arc, - broker_client: 
storage_broker::BrokerClientChannel, auth: Option>, socket: tokio::net::TcpStream, auth_type: AuthType, connection_ctx: RequestContext, ) -> anyhow::Result<()> { - // Immediately increment the gauge, then create a job to decrement it on task exit. - // One of the pros of `defer!` is that this will *most probably* - // get called, even in presence of panics. - let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["page_service"]); - gauge.inc(); - scopeguard::defer! { - gauge.dec(); - } + let _guard = LIVE_CONNECTIONS + .with_label_values(&["page_service"]) + .guard(); socket .set_nodelay(true) @@ -267,12 +203,11 @@ async fn page_service_conn_main( // and create a child per-query context when it invokes process_query. // But it's in a shared crate, so, we store connection_ctx inside PageServerHandler // and create the per-query context in process_query ourselves. - let mut conn_handler = - PageServerHandler::new(tenant_manager, broker_client, auth, connection_ctx); + let mut conn_handler = PageServerHandler::new(tenant_manager, auth, connection_ctx); let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?; match pgbackend - .run(&mut conn_handler, task_mgr::shutdown_watcher) + .run(&mut conn_handler, &task_mgr::shutdown_token()) .await { Ok(()) => { @@ -299,7 +234,6 @@ struct HandlerTimeline { } struct PageServerHandler { - broker_client: storage_broker::BrokerClientChannel, auth: Option>, claims: Option, @@ -391,13 +325,11 @@ impl From for QueryError { impl PageServerHandler { pub fn new( tenant_manager: Arc, - broker_client: storage_broker::BrokerClientChannel, auth: Option>, connection_ctx: RequestContext, ) -> Self { PageServerHandler { tenant_manager, - broker_client, auth, claims: None, connection_ctx, @@ -480,73 +412,6 @@ impl PageServerHandler { ) } - fn copyin_stream<'a, IO>( - &'a self, - pgb: &'a mut PostgresBackend, - cancel: &'a CancellationToken, - ) -> impl Stream> + 'a - where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, - { - async_stream::try_stream! { - loop { - let msg = tokio::select! { - biased; - - _ = cancel.cancelled() => { - // We were requested to shut down. 
- let msg = "pageserver is shutting down"; - let _ = pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, None)); - Err(QueryError::Shutdown) - } - - msg = pgb.read_message() => { msg.map_err(QueryError::from)} - }; - - match msg { - Ok(Some(message)) => { - let copy_data_bytes = match message { - FeMessage::CopyData(bytes) => bytes, - FeMessage::CopyDone => { break }, - FeMessage::Sync => continue, - FeMessage::Terminate => { - let msg = "client terminated connection with Terminate message during COPY"; - let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg))); - // error can't happen here, ErrorResponse serialization should be always ok - pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?; - Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; - break; - } - m => { - let msg = format!("unexpected message {m:?}"); - // error can't happen here, ErrorResponse serialization should be always ok - pgb.write_message_noflush(&BeMessage::ErrorResponse(&msg, None)).map_err(|e| e.into_io_error())?; - Err(io::Error::new(io::ErrorKind::Other, msg))?; - break; - } - }; - - yield copy_data_bytes; - } - Ok(None) => { - let msg = "client closed connection during COPY"; - let query_error = QueryError::Disconnected(ConnectionError::Io(io::Error::new(io::ErrorKind::ConnectionReset, msg))); - // error can't happen here, ErrorResponse serialization should be always ok - pgb.write_message_noflush(&BeMessage::ErrorResponse(msg, Some(query_error.pg_error_code()))).map_err(|e| e.into_io_error())?; - self.flush_cancellable(pgb, cancel).await.map_err(|e| io::Error::new(io::ErrorKind::Other, e.to_string()))?; - Err(io::Error::new(io::ErrorKind::ConnectionReset, msg))?; - } - Err(QueryError::Disconnected(ConnectionError::Io(io_error))) => { - Err(io_error)?; - } - Err(other) => { - Err(io::Error::new(io::ErrorKind::Other, other.to_string()))?; - } - }; - } - } - } - #[instrument(skip_all)] async fn handle_pagerequests( &mut self, @@ -718,128 +583,6 @@ impl PageServerHandler { Ok(()) } - #[allow(clippy::too_many_arguments)] - #[instrument(skip_all, fields(%base_lsn, end_lsn=%_end_lsn, %pg_version))] - async fn handle_import_basebackup( - &self, - pgb: &mut PostgresBackend, - tenant_id: TenantId, - timeline_id: TimelineId, - base_lsn: Lsn, - _end_lsn: Lsn, - pg_version: u32, - ctx: RequestContext, - ) -> Result<(), QueryError> - where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, - { - debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); - - // Create empty timeline - info!("creating new timeline"); - let tenant = self - .get_active_tenant_with_timeout(tenant_id, ShardSelector::Zero, ACTIVE_TENANT_TIMEOUT) - .await?; - let timeline = tenant - .create_empty_timeline(timeline_id, base_lsn, pg_version, &ctx) - .await?; - - // TODO mark timeline as not ready until it reaches end_lsn. - // We might have some wal to import as well, and we should prevent compute - // from connecting before that and writing conflicting wal. - // - // This is not relevant for pageserver->pageserver migrations, since there's - // no wal to import. But should be fixed if we want to import from postgres. - - // TODO leave clean state on error. For now you can use detach to clean - // up broken state from a failed import. 
- - // Import basebackup provided via CopyData - info!("importing basebackup"); - pgb.write_message_noflush(&BeMessage::CopyInResponse)?; - self.flush_cancellable(pgb, &tenant.cancel).await?; - - let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &tenant.cancel))); - timeline - .import_basebackup_from_tar( - tenant.clone(), - &mut copyin_reader, - base_lsn, - self.broker_client.clone(), - &ctx, - ) - .await?; - - // Read the end of the tar archive. - read_tar_eof(copyin_reader).await?; - - // TODO check checksum - // Meanwhile you can verify client-side by taking fullbackup - // and checking that it matches in size with what was imported. - // It wouldn't work if base came from vanilla postgres though, - // since we discard some log files. - - info!("done"); - Ok(()) - } - - #[instrument(skip_all, fields(shard_id, %start_lsn, %end_lsn))] - async fn handle_import_wal( - &self, - pgb: &mut PostgresBackend, - tenant_id: TenantId, - timeline_id: TimelineId, - start_lsn: Lsn, - end_lsn: Lsn, - ctx: RequestContext, - ) -> Result<(), QueryError> - where - IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, - { - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - let last_record_lsn = timeline.get_last_record_lsn(); - if last_record_lsn != start_lsn { - return Err(QueryError::Other( - anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")) - ); - } - - // TODO leave clean state on error. For now you can use detach to clean - // up broken state from a failed import. - - // Import wal provided via CopyData - info!("importing wal"); - pgb.write_message_noflush(&BeMessage::CopyInResponse)?; - self.flush_cancellable(pgb, &timeline.cancel).await?; - let mut copyin_reader = pin!(StreamReader::new(self.copyin_stream(pgb, &timeline.cancel))); - import_wal_from_tar(&timeline, &mut copyin_reader, start_lsn, end_lsn, &ctx).await?; - info!("wal import complete"); - - // Read the end of the tar archive. - read_tar_eof(copyin_reader).await?; - - // TODO Does it make sense to overshoot? - if timeline.get_last_record_lsn() < end_lsn { - return Err(QueryError::Other( - anyhow::anyhow!("Cannot import WAL from Lsn {start_lsn} because timeline does not start from the same lsn: {last_record_lsn}")) - ); - } - - // Flush data to disk, then upload to s3. No need for a forced checkpoint. - // We only want to persist the data, and it doesn't matter if it's in the - // shape of deltas or images. - info!("flushing layers"); - timeline.freeze_and_flush().await.map_err(|e| match e { - FlushLayerError::Cancelled => QueryError::Shutdown, - other => QueryError::Other(other.into()), - })?; - - info!("done"); - Ok(()) - } - /// Helper function to handle the LSN from client request. 
/// /// Each GetPage (and Exists and Nblocks) request includes information about @@ -1656,53 +1399,6 @@ where metric_recording.observe(&res); res?; } - // return pair of prev_lsn and last_lsn - else if let Some(params) = parts.strip_prefix(&["get_last_record_rlsn"]) { - if params.len() != 2 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for get_last_record_rlsn command" - ))); - } - - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::GetLastRecordRlsn) - .inc(); - - async { - let timeline = self - .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero) - .await?; - - let end_of_timeline = timeline.get_last_record_rlsn(); - - pgb.write_message_noflush(&BeMessage::RowDescription(&[ - RowDescriptor::text_col(b"prev_lsn"), - RowDescriptor::text_col(b"last_lsn"), - ]))? - .write_message_noflush(&BeMessage::DataRow(&[ - Some(end_of_timeline.prev.to_string().as_bytes()), - Some(end_of_timeline.last.to_string().as_bytes()), - ]))? - .write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - anyhow::Ok(()) - } - .instrument(info_span!( - "handle_get_last_record_lsn", - shard_id = tracing::field::Empty - )) - .await?; - } // same as basebackup, but result includes relational data as well else if let Some(params) = parts.strip_prefix(&["fullbackup"]) { if params.len() < 2 { @@ -1757,109 +1453,6 @@ where ) .await?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("import basebackup ") { - // Import the `base` section (everything but the wal) of a basebackup. - // Assumes the tenant already exists on this pageserver. - // - // Files are scheduled to be persisted to remote storage, and the - // caller should poll the http api to check when that is done. - // - // Example import command: - // 1. Get start/end LSN from backup_manifest file - // 2. 
Run: - // cat my_backup/base.tar | psql -h $PAGESERVER \ - // -c "import basebackup $TENANT $TIMELINE $START_LSN $END_LSN $PG_VERSION" - let params = &parts[2..]; - if params.len() != 5 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for import basebackup command" - ))); - } - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - let base_lsn = Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; - let end_lsn = Lsn::from_str(params[3]) - .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?; - let pg_version = u32::from_str(params[4]) - .with_context(|| format!("Failed to parse pg_version from {}", params[4]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::ImportBasebackup) - .inc(); - - match self - .handle_import_basebackup( - pgb, - tenant_id, - timeline_id, - base_lsn, - end_lsn, - pg_version, - ctx, - ) - .await - { - Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, - Err(e) => { - error!("error importing base backup between {base_lsn} and {end_lsn}: {e:?}"); - pgb.write_message_noflush(&BeMessage::ErrorResponse( - &e.to_string(), - Some(e.pg_error_code()), - ))? - } - }; - } else if query_string.starts_with("import wal ") { - // Import the `pg_wal` section of a basebackup. - // - // Files are scheduled to be persisted to remote storage, and the - // caller should poll the http api to check when that is done. - let params = &parts[2..]; - if params.len() != 4 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for import wal command" - ))); - } - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - let start_lsn = Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; - let end_lsn = Lsn::from_str(params[3]) - .with_context(|| format!("Failed to parse Lsn from {}", params[3]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::ImportWal) - .inc(); - - match self - .handle_import_wal(pgb, tenant_id, timeline_id, start_lsn, end_lsn, ctx) - .await - { - Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, - Err(e) => { - error!("error importing WAL between {start_lsn} and {end_lsn}: {e:?}"); - pgb.write_message_noflush(&BeMessage::ErrorResponse( - &e.to_string(), - Some(e.pg_error_code()), - ))? 
- } - }; } else if query_string.to_ascii_lowercase().starts_with("set ") { // important because psycopg2 executes "SET datestyle TO 'ISO'" // on connect diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 25d00d6dfd0d..8a6cfea92b3b 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -854,13 +854,14 @@ impl Timeline { result.add_key(DBDIR_KEY); // Fetch list of database dirs and iterate them - let buf = self.get(DBDIR_KEY, lsn, ctx).await?; - let dbdir = DbDirectory::des(&buf)?; + let dbdir = self.list_dbdirs(lsn, ctx).await?; + let mut dbs: Vec<((Oid, Oid), bool)> = dbdir.into_iter().collect(); - let mut dbs: Vec<(Oid, Oid)> = dbdir.dbdirs.keys().cloned().collect(); - dbs.sort_unstable(); - for (spcnode, dbnode) in dbs { - result.add_key(relmap_file_key(spcnode, dbnode)); + dbs.sort_unstable_by(|(k_a, _), (k_b, _)| k_a.cmp(k_b)); + for ((spcnode, dbnode), has_relmap_file) in dbs { + if has_relmap_file { + result.add_key(relmap_file_key(spcnode, dbnode)); + } result.add_key(rel_dir_to_key(spcnode, dbnode)); let mut rels: Vec = self @@ -919,6 +920,9 @@ impl Timeline { result.add_key(AUX_FILES_KEY); } + // Add extra keyspaces in the test cases. Some test cases write keys into the storage without + // creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace` + // and the keys will not be garbage-colllected. #[cfg(test)] { let guard = self.extra_test_dense_keyspace.load(); @@ -927,13 +931,48 @@ impl Timeline { } } - Ok(( - result.to_keyspace(), - /* AUX sparse key space */ - SparseKeySpace(KeySpace { - ranges: vec![repl_origin_key_range(), Key::metadata_aux_key_range()], - }), - )) + let dense_keyspace = result.to_keyspace(); + let sparse_keyspace = SparseKeySpace(KeySpace { + ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()], + }); + + if cfg!(debug_assertions) { + // Verify if the sparse keyspaces are ordered and non-overlapping. + + // We do not use KeySpaceAccum for sparse_keyspace because we want to ensure each + // category of sparse keys are split into their own image/delta files. If there + // are overlapping keyspaces, they will be automatically merged by keyspace accum, + // and we want the developer to keep the keyspaces separated. 
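A side note on the check that follows: the ranges are treated as half-open, so two ranges that merely touch at a boundary do not count as overlapping. A standalone illustration with integer ranges, kept generic for clarity (the in-tree helper operates on Key ranges):

use std::ops::Range;

// Half-open overlap test: x is in a iff a.start <= x < a.end, so touching
// ranges share no element and are not overlapping.
fn overlaps_with<T: Ord>(a: &Range<T>, b: &Range<T>) -> bool {
    !(a.end <= b.start || b.end <= a.start)
}

fn main() {
    assert!(overlaps_with(&(0..10), &(5..15)));   // shared span 5..10
    assert!(!overlaps_with(&(0..10), &(10..20))); // adjacent, no overlap
    assert!(!overlaps_with(&(20..30), &(0..10))); // disjoint, order-independent
}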
+ + let ranges = &sparse_keyspace.0.ranges; + + // TODO: use a single overlaps_with across the codebase + fn overlaps_with(a: &Range, b: &Range) -> bool { + !(a.end <= b.start || b.end <= a.start) + } + for i in 0..ranges.len() { + for j in 0..i { + if overlaps_with(&ranges[i], &ranges[j]) { + panic!( + "overlapping sparse keyspace: {}..{} and {}..{}", + ranges[i].start, ranges[i].end, ranges[j].start, ranges[j].end + ); + } + } + } + for i in 1..ranges.len() { + assert!( + ranges[i - 1].end <= ranges[i].start, + "unordered sparse keyspace: {}..{} and {}..{}", + ranges[i - 1].start, + ranges[i - 1].end, + ranges[i].start, + ranges[i].end + ); + } + } + + Ok((dense_keyspace, sparse_keyspace)) } /// Get cached size of relation if it not updated after specified LSN diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 45e542a3367c..eef8dc104c69 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -73,6 +73,7 @@ use crate::deletion_queue::DeletionQueueClient; use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; use crate::is_uninit_mark; +use crate::l0_flush::L0FlushGlobalState; use crate::metrics::TENANT; use crate::metrics::{ remove_tenant_metrics, BROKEN_TENANTS_SET, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, @@ -88,6 +89,7 @@ use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; +use crate::walredo; use crate::InitializationOrder; use std::collections::hash_map::Entry; use std::collections::BTreeSet; @@ -165,6 +167,7 @@ pub struct TenantSharedResources { pub broker_client: storage_broker::BrokerClientChannel, pub remote_storage: GenericRemoteStorage, pub deletion_queue_client: DeletionQueueClient, + pub l0_flush_global_state: L0FlushGlobalState, } /// A [`Tenant`] is really an _attached_ tenant. The configuration @@ -212,8 +215,6 @@ pub(crate) enum SpawnMode { Eager, /// Lazy activation in the background, with the option to skip the queue if the need comes up Lazy, - /// Tenant has been created during the lifetime of this process - Create, } /// @@ -295,6 +296,8 @@ pub struct Tenant { /// An ongoing timeline detach must be checked during attempts to GC or compact a timeline. ongoing_timeline_detach: std::sync::Mutex>, + + l0_flush_global_state: L0FlushGlobalState, } impl std::fmt::Debug for Tenant { @@ -323,6 +326,16 @@ impl From for WalRedoManager { } impl WalRedoManager { + pub(crate) async fn shutdown(&self) { + match self { + Self::Prod(mgr) => mgr.shutdown().await, + #[cfg(test)] + Self::Test(_) => { + // Not applicable to test redo manager + } + } + } + pub(crate) fn maybe_quiesce(&self, idle_timeout: Duration) { match self { Self::Prod(mgr) => mgr.maybe_quiesce(idle_timeout), @@ -343,7 +356,7 @@ impl WalRedoManager { base_img: Option<(Lsn, bytes::Bytes)>, records: Vec<(Lsn, crate::walrecord::NeonWalRecord)>, pg_version: u32, - ) -> anyhow::Result { + ) -> Result { match self { Self::Prod(mgr) => { mgr.request_redo(key, lsn, base_img, records, pg_version) @@ -520,6 +533,15 @@ impl From for GcError { } } +#[derive(thiserror::Error, Debug)] +pub(crate) enum LoadConfigError { + #[error("TOML deserialization error: '{0}'")] + DeserializeToml(#[from] toml_edit::de::Error), + + #[error("Config not found at {0}")] + NotFound(Utf8PathBuf), +} + impl Tenant { /// Yet another helper for timeline initialization. 
/// @@ -658,6 +680,7 @@ impl Tenant { broker_client, remote_storage, deletion_queue_client, + l0_flush_global_state, } = resources; let attach_mode = attached_conf.location.attach_mode; @@ -672,6 +695,7 @@ impl Tenant { tenant_shard_id, remote_storage.clone(), deletion_queue_client, + l0_flush_global_state, )); // The attach task will carry a GateGuard, so that shutdown() reliably waits for it to drop out if @@ -797,9 +821,6 @@ impl Tenant { }; let preload = match &mode { - SpawnMode::Create => { - None - }, SpawnMode::Eager | SpawnMode::Lazy => { let _preload_timer = TENANT.preload.start_timer(); let res = tenant_clone @@ -821,11 +842,8 @@ impl Tenant { // We will time the duration of the attach phase unless this is a creation (attach will do no work) let attached = { - let _attach_timer = match mode { - SpawnMode::Create => None, - SpawnMode::Eager | SpawnMode::Lazy => Some(TENANT.attach.start_timer()), - }; - tenant_clone.attach(preload, mode, &ctx).await + let _attach_timer = Some(TENANT.attach.start_timer()); + tenant_clone.attach(preload, &ctx).await }; match attached { @@ -901,21 +919,14 @@ impl Tenant { async fn attach( self: &Arc, preload: Option, - mode: SpawnMode, ctx: &RequestContext, ) -> anyhow::Result<()> { span::debug_assert_current_span_has_tenant_id(); failpoint_support::sleep_millis_async!("before-attaching-tenant"); - let preload = match (preload, mode) { - (Some(p), _) => p, - (None, SpawnMode::Create) => TenantPreload { - timelines: HashMap::new(), - }, - (None, _) => { - anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"); - } + let Some(preload) = preload else { + anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"); }; let mut timelines_to_resume_deletions = vec![]; @@ -984,6 +995,7 @@ impl Tenant { TimelineResources { remote_client, timeline_get_throttle: self.timeline_get_throttle.clone(), + l0_flush_global_state: self.l0_flush_global_state.clone(), }, ctx, ) @@ -1353,7 +1365,7 @@ impl Tenant { initdb_lsn: Lsn, pg_version: u32, ctx: &RequestContext, - delta_layer_desc: Vec>, + delta_layer_desc: Vec, image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, end_lsn: Lsn, ) -> anyhow::Result> { @@ -1804,9 +1816,15 @@ impl Tenant { // If we're still attaching, fire the cancellation token early to drop out: this // will prevent us flushing, but ensures timely shutdown if some I/O during attach // is very slow. 
- if matches!(self.current_state(), TenantState::Attaching) { + let shutdown_mode = if matches!(self.current_state(), TenantState::Attaching) { self.cancel.cancel(); - } + + // Having fired our cancellation token, do not try and flush timelines: their cancellation tokens + // are children of ours, so their flush loops will have shut down already + timeline::ShutdownMode::Hard + } else { + shutdown_mode + }; match self.set_stopping(shutdown_progress, false, false).await { Ok(()) => {} @@ -1853,6 +1871,10 @@ impl Tenant { tracing::debug!("Waiting for tasks..."); task_mgr::shutdown_tasks(None, Some(self.tenant_shard_id), None).await; + if let Some(walredo_mgr) = self.walredo_mgr.as_ref() { + walredo_mgr.shutdown().await; + } + // Wait for any in-flight operations to complete self.gate.close().await; @@ -2469,6 +2491,7 @@ impl Tenant { tenant_shard_id: TenantShardId, remote_storage: GenericRemoteStorage, deletion_queue_client: DeletionQueueClient, + l0_flush_global_state: L0FlushGlobalState, ) -> Tenant { debug_assert!( !attached_conf.location.generation.is_none() || conf.control_plane_api.is_none() @@ -2556,6 +2579,7 @@ impl Tenant { )), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), ongoing_timeline_detach: std::sync::Mutex::default(), + l0_flush_global_state, } } @@ -2563,36 +2587,35 @@ impl Tenant { pub(super) fn load_tenant_config( conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, - ) -> anyhow::Result { + ) -> Result { let config_path = conf.tenant_location_config_path(tenant_shard_id); - if config_path.exists() { - // New-style config takes precedence - let deserialized = Self::read_config(&config_path)?; - Ok(toml_edit::de::from_document::(deserialized)?) - } else { - // The config should almost always exist for a tenant directory: - // - When attaching a tenant, the config is the first thing we write - // - When detaching a tenant, we atomically move the directory to a tmp location - // before deleting contents. - // - // The very rare edge case that can result in a missing config is if we crash during attach - // between creating directory and writing config. Callers should handle that as if the - // directory didn't exist. - anyhow::bail!("tenant config not found in {}", config_path); - } - } - - fn read_config(path: &Utf8Path) -> anyhow::Result { - info!("loading tenant configuration from {path}"); + info!("loading tenant configuration from {config_path}"); // load and parse file - let config = fs::read_to_string(path) - .with_context(|| format!("Failed to load config from path '{path}'"))?; + let config = fs::read_to_string(&config_path).map_err(|e| { + match e.kind() { + std::io::ErrorKind::NotFound => { + // The config should almost always exist for a tenant directory: + // - When attaching a tenant, the config is the first thing we write + // - When detaching a tenant, we atomically move the directory to a tmp location + // before deleting contents. + // + // The very rare edge case that can result in a missing config is if we crash during attach + // between creating directory and writing config. Callers should handle that as if the + // directory didn't exist. 
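An aside on the caller side of this split error type: a caller that wants to treat a missing config as "tenant directory absent" while surfacing parse failures might branch on the two variants roughly as below. The enum here is a local stand-in mirroring the variants introduced above (the real one uses Utf8PathBuf and toml_edit's error type), and the calling function is purely hypothetical.

use std::path::PathBuf;

// Local stand-ins for illustration only.
#[derive(Debug)]
enum LoadConfigError {
    DeserializeToml(String),
    NotFound(PathBuf),
}

// Hypothetical caller: a missing config means "no tenant here", while a
// malformed config is an error worth reporting loudly.
fn describe(load_result: Result<(), LoadConfigError>) -> String {
    match load_result {
        Ok(()) => "config loaded".to_string(),
        Err(LoadConfigError::NotFound(path)) => {
            format!("no tenant config at {}, treating directory as absent", path.display())
        }
        Err(LoadConfigError::DeserializeToml(e)) => {
            format!("tenant config is corrupt: {e}")
        }
    }
}

fn main() {
    println!("{}", describe(Err(LoadConfigError::NotFound(PathBuf::from("/tmp/t/config-v1")))));
    println!("{}", describe(Err(LoadConfigError::DeserializeToml("unexpected key".to_string()))));
}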
+ + LoadConfigError::NotFound(config_path) + } + _ => { + // No IO errors except NotFound are acceptable here: other kinds of error indicate local storage or permissions issues + // that we cannot cleanly recover + crate::virtual_file::on_fatal_io_error(&e, "Reading tenant config file") + } + } + })?; - config - .parse::() - .with_context(|| format!("Failed to parse config from file '{path}' as toml file")) + Ok(toml_edit::de::from_str::(&config)?) } #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))] @@ -2600,7 +2623,7 @@ impl Tenant { conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, location_conf: &LocationConf, - ) -> anyhow::Result<()> { + ) -> std::io::Result<()> { let config_path = conf.tenant_location_config_path(tenant_shard_id); Self::persist_tenant_config_at(tenant_shard_id, &config_path, location_conf).await @@ -2611,7 +2634,7 @@ impl Tenant { tenant_shard_id: &TenantShardId, config_path: &Utf8Path, location_conf: &LocationConf, - ) -> anyhow::Result<()> { + ) -> std::io::Result<()> { debug!("persisting tenantconf to {config_path}"); let mut conf_content = r#"# This file contains a specific per-tenant's config. @@ -2620,22 +2643,20 @@ impl Tenant { .to_string(); fail::fail_point!("tenant-config-before-write", |_| { - anyhow::bail!("tenant-config-before-write"); + Err(std::io::Error::new( + std::io::ErrorKind::Other, + "tenant-config-before-write", + )) }); // Convert the config to a toml file. - conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?; + conf_content += + &toml_edit::ser::to_string_pretty(&location_conf).expect("Config serialization failed"); let temp_path = path_with_suffix_extension(config_path, TEMP_FILE_SUFFIX); - let tenant_shard_id = *tenant_shard_id; - let config_path = config_path.to_owned(); let conf_content = conf_content.into_bytes(); - VirtualFile::crashsafe_overwrite(config_path.clone(), temp_path, conf_content) - .await - .with_context(|| format!("write tenant {tenant_shard_id} config to {config_path}"))?; - - Ok(()) + VirtualFile::crashsafe_overwrite(config_path.to_owned(), temp_path, conf_content).await } // @@ -2853,6 +2874,7 @@ impl Tenant { { let mut target = timeline.gc_info.write().unwrap(); + // Cull any expired leases let now = SystemTime::now(); target.leases.retain(|_, lease| !lease.is_expired(&now)); @@ -2861,6 +2883,31 @@ impl Tenant { .valid_lsn_lease_count_gauge .set(target.leases.len() as u64); + // Look up parent's PITR cutoff to update the child's knowledge of whether it is within parent's PITR + if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() { + if let Some(ancestor_gc_cutoffs) = gc_cutoffs.get(&ancestor_id) { + target.within_ancestor_pitr = + timeline.get_ancestor_lsn() >= ancestor_gc_cutoffs.pitr; + } + } + + // Update metrics that depend on GC state + timeline + .metrics + .archival_size + .set(if target.within_ancestor_pitr { + timeline.metrics.current_logical_size_gauge.get() + } else { + 0 + }); + timeline.metrics.pitr_history_size.set( + timeline + .get_last_record_lsn() + .checked_sub(target.cutoffs.pitr) + .unwrap_or(Lsn(0)) + .0, + ); + match gc_cutoffs.remove(&timeline.timeline_id) { Some(cutoffs) => { target.retain_lsns = branchpoints; @@ -2912,7 +2959,7 @@ impl Tenant { dst_id: TimelineId, ancestor_lsn: Option, ctx: &RequestContext, - delta_layer_desc: Vec>, + delta_layer_desc: Vec, image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>, end_lsn: Lsn, ) -> anyhow::Result> { @@ -3296,6 
+3343,7 @@ impl Tenant { TimelineResources { remote_client, timeline_get_throttle: self.timeline_get_throttle.clone(), + l0_flush_global_state: self.l0_flush_global_state.clone(), } } @@ -3632,6 +3680,7 @@ pub(crate) mod harness { use utils::logging; use crate::deletion_queue::mock::MockDeletionQueue; + use crate::l0_flush::L0FlushConfig; use crate::walredo::apply_neon; use crate::{repository::Key, walrecord::NeonWalRecord}; @@ -3821,12 +3870,14 @@ pub(crate) mod harness { self.tenant_shard_id, self.remote_storage.clone(), self.deletion_queue.new_client(), + // TODO: ideally we should run all unit tests with both configs + L0FlushGlobalState::new(L0FlushConfig::default()), )); let preload = tenant .preload(&self.remote_storage, CancellationToken::new()) .await?; - tenant.attach(Some(preload), SpawnMode::Eager, ctx).await?; + tenant.attach(Some(preload), ctx).await?; tenant.state.send_replace(TenantState::Active); for timeline in tenant.timelines.lock().unwrap().values() { @@ -3854,7 +3905,7 @@ pub(crate) mod harness { base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, _pg_version: u32, - ) -> anyhow::Result { + ) -> Result { let records_neon = records.iter().all(|r| apply_neon::can_apply_in_neon(&r.1)); if records_neon { // For Neon wal records, we can decode without spawning postgres, so do so. @@ -3908,7 +3959,7 @@ mod tests { use storage_layer::PersistentLayerKey; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; - use timeline::GcInfo; + use timeline::{DeltaLayerTestDesc, GcInfo}; use utils::bin_ser::BeSer; use utils::id::TenantId; @@ -6204,27 +6255,6 @@ mod tests { .await .unwrap(); - async fn get_vectored_impl_wrapper( - tline: &Arc, - key: Key, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result, GetVectoredError> { - let mut reconstruct_state = ValuesReconstructState::new(); - let mut res = tline - .get_vectored_impl( - KeySpace::single(key..key.next()), - lsn, - &mut reconstruct_state, - ctx, - ) - .await?; - Ok(res.pop_last().map(|(k, v)| { - assert_eq!(k, key); - v.unwrap() - })) - } - let lsn = Lsn(0x30); // test vectored get on parent timeline @@ -6264,7 +6294,7 @@ mod tests { #[tokio::test] async fn test_vectored_missing_metadata_key_reads() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_vectored_missing_data_key_reads")?; + let harness = TenantHarness::create("test_vectored_missing_metadata_key_reads")?; let (tenant, ctx) = harness.load().await; let base_key = Key::from_hex("620000000033333333444444445500000000").unwrap(); @@ -6300,27 +6330,6 @@ mod tests { .await .unwrap(); - async fn get_vectored_impl_wrapper( - tline: &Arc, - key: Key, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result, GetVectoredError> { - let mut reconstruct_state = ValuesReconstructState::new(); - let mut res = tline - .get_vectored_impl( - KeySpace::single(key..key.next()), - lsn, - &mut reconstruct_state, - ctx, - ) - .await?; - Ok(res.pop_last().map(|(k, v)| { - assert_eq!(k, key); - v.unwrap() - })) - } - let lsn = Lsn(0x30); // test vectored get on parent timeline @@ -6396,9 +6405,18 @@ mod tests { &ctx, // delta layers vec![ - vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], - vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], - vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + 
Lsn(0x20)..Lsn(0x30), + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ), ], // image layers vec![ @@ -6464,17 +6482,29 @@ mod tests { &ctx, // delta layers vec![ - vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], - vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], - vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], - vec![ - (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))), - (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))), - ], + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x30)..Lsn(0x40), + vec![ + (key0, Lsn(0x30), Value::Image(test_img("metadata key 0"))), + (key3, Lsn(0x30), Value::Image(test_img("metadata key 3"))), + ], + ), ], // image layers vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], - Lsn(0x30), + Lsn(0x40), ) .await .unwrap(); @@ -6497,7 +6527,7 @@ mod tests { // Image layers are created at last_record_lsn let images = tline - .inspect_image_layers(Lsn(0x30), &ctx) + .inspect_image_layers(Lsn(0x40), &ctx) .await .unwrap() .into_iter() @@ -6523,9 +6553,18 @@ mod tests { &ctx, // delta layers vec![ - vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], - vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], - vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x20), + vec![(key2, Lsn(0x10), Value::Image(test_img("metadata key 2")))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key1, Lsn(0x20), Value::Image(Bytes::new()))], + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x30), + vec![(key2, Lsn(0x20), Value::Image(Bytes::new()))], + ), ], // image layers vec![(Lsn(0x10), vec![(key1, test_img("metadata key 1"))])], @@ -6573,15 +6612,21 @@ mod tests { key } - // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon. + // We create + // - one bottom-most image layer, + // - a delta layer D1 crossing the GC horizon with data below and above the horizon, + // - a delta layer D2 crossing the GC horizon with data only below the horizon, + // - a delta layer D3 above the horizon. 
// - // | D1 | | D3 | + // | D3 | + // | D1 | // -| |-- gc horizon ----------------- // | | | D2 | // --------- img layer ------------------ // // What we should expact from this compaction is: - // | Part of D1 | | D3 | + // | D3 | + // | Part of D1 | // --------- img layer with D1+D2 at GC horizon------------------ // img layer at 0x10 @@ -6621,13 +6666,13 @@ mod tests { let delta3 = vec![ ( get_key(8), - Lsn(0x40), - Value::Image(Bytes::from("value 8@0x40")), + Lsn(0x48), + Value::Image(Bytes::from("value 8@0x48")), ), ( get_key(9), - Lsn(0x40), - Value::Image(Bytes::from("value 9@0x40")), + Lsn(0x48), + Value::Image(Bytes::from("value 9@0x48")), ), ]; @@ -6637,7 +6682,11 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, - vec![delta1, delta2, delta3], // delta layers + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers vec![(Lsn(0x10), img_layer)], // image layers Lsn(0x50), ) @@ -6658,8 +6707,8 @@ mod tests { Bytes::from_static(b"value 5@0x20"), Bytes::from_static(b"value 6@0x20"), Bytes::from_static(b"value 7@0x10"), - Bytes::from_static(b"value 8@0x40"), - Bytes::from_static(b"value 9@0x40"), + Bytes::from_static(b"value 8@0x48"), + Bytes::from_static(b"value 9@0x48"), ]; for (idx, expected) in expected_result.iter().enumerate() { @@ -6747,10 +6796,10 @@ mod tests { lsn_range: Lsn(0x30)..Lsn(0x41), is_delta: true }, - // The delta layer we created and should not be picked for the compaction + // The delta3 layer that should not be picked for the compaction PersistentLayerKey { key_range: get_key(8)..get_key(10), - lsn_range: Lsn(0x40)..Lsn(0x41), + lsn_range: Lsn(0x48)..Lsn(0x50), is_delta: true } ] @@ -6814,7 +6863,10 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, - vec![delta1], // delta layers + vec![DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x40), + delta1, + )], // delta layers vec![(Lsn(0x10), image1)], // image layers Lsn(0x50), ) @@ -6938,15 +6990,21 @@ mod tests { key } - // We create one bottom-most image layer, a delta layer D1 crossing the GC horizon, D2 below the horizon, and D3 above the horizon. + // We create + // - one bottom-most image layer, + // - a delta layer D1 crossing the GC horizon with data below and above the horizon, + // - a delta layer D2 crossing the GC horizon with data only below the horizon, + // - a delta layer D3 above the horizon. 
// - // | D1 | | D3 | + // | D3 | + // | D1 | // -| |-- gc horizon ----------------- // | | | D2 | // --------- img layer ------------------ // // What we should expact from this compaction is: - // | Part of D1 | | D3 | + // | D3 | + // | Part of D1 | // --------- img layer with D1+D2 at GC horizon------------------ // img layer at 0x10 @@ -6996,13 +7054,13 @@ mod tests { let delta3 = vec![ ( get_key(8), - Lsn(0x40), - Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), ), ( get_key(9), - Lsn(0x40), - Value::WalRecord(NeonWalRecord::wal_append("@0x40")), + Lsn(0x48), + Value::WalRecord(NeonWalRecord::wal_append("@0x48")), ), ]; @@ -7012,7 +7070,11 @@ mod tests { Lsn(0x10), DEFAULT_PG_VERSION, &ctx, - vec![delta1, delta2, delta3], // delta layers + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers vec![(Lsn(0x10), img_layer)], // image layers Lsn(0x50), ) @@ -7027,6 +7089,7 @@ mod tests { horizon: Lsn(0x30), }, leases: Default::default(), + within_ancestor_pitr: false, }; } @@ -7039,8 +7102,8 @@ mod tests { Bytes::from_static(b"value 5@0x10@0x20"), Bytes::from_static(b"value 6@0x10@0x20"), Bytes::from_static(b"value 7@0x10"), - Bytes::from_static(b"value 8@0x10@0x40"), - Bytes::from_static(b"value 9@0x10@0x40"), + Bytes::from_static(b"value 8@0x10@0x48"), + Bytes::from_static(b"value 9@0x10@0x48"), ]; let expected_result_at_gc_horizon = [ diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 2be8816cefbd..e98ed66ef998 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -6,13 +6,20 @@ //! is written as a one byte. If it's larger than that, the length //! is written as a four-byte integer, in big-endian, with the high //! bit set. This way, we can detect whether it's 1- or 4-byte header -//! by peeking at the first byte. +//! by peeking at the first byte. For blobs larger than 128 bits, +//! we also specify three reserved bits, only one of the three bit +//! patterns is currently in use (0b011) and signifies compression +//! with zstd. //! //! len < 128: 0XXXXXXX -//! len >= 128: 1XXXXXXX XXXXXXXX XXXXXXXX XXXXXXXX +//! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX //! 
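The module comment above describes the on-disk blob header: a 1-byte length for short blobs, and a 4-byte big-endian length whose top nibble carries the high bit plus the reserved compression bits. Below is a minimal standalone sketch of that scheme, reusing the constant values this patch adds later in blob_io.rs (0x80 uncompressed, 0x90 zstd, 0xf0 mask, 0x0fff_ffff max length); the `encode_blob`/`decode_blob` names and the use of the synchronous `zstd` crate instead of `async_compression` are illustrative assumptions, not part of the patch.

```rust
// Standalone sketch of the length-header scheme described in the module comment.
// Constants mirror the ones this patch adds to blob_io.rs.
const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0;
const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff;
const BYTE_UNCOMPRESSED: u8 = 0x80;
const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;

/// Encode one blob: short blobs get a 1-byte header, long blobs a 4-byte
/// header whose top nibble carries the compression tag. Compression is
/// only kept if it actually makes the payload smaller.
fn encode_blob(payload: &[u8]) -> std::io::Result<Vec<u8>> {
    let mut out = Vec::new();
    if payload.len() < 128 {
        out.push(payload.len() as u8);
        out.extend_from_slice(payload);
        return Ok(out);
    }
    assert!(payload.len() <= MAX_SUPPORTED_LEN, "blob too large");
    let compressed = zstd::encode_all(payload, 1)?; // level 1, as in the tests below
    let (tag, body) = if compressed.len() < payload.len() {
        (BYTE_ZSTD, compressed.as_slice())
    } else {
        (BYTE_UNCOMPRESSED, payload)
    };
    let mut len_buf = (body.len() as u32).to_be_bytes();
    assert_eq!(len_buf[0] & LEN_COMPRESSION_BIT_MASK, 0);
    len_buf[0] |= tag;
    out.extend_from_slice(&len_buf);
    out.extend_from_slice(body);
    Ok(out)
}

/// Decode the header written by `encode_blob` and return the original bytes.
fn decode_blob(buf: &[u8]) -> std::io::Result<Vec<u8>> {
    let first = buf[0];
    if first < 0x80 {
        // 1-byte header: the byte itself is the length.
        return Ok(buf[1..1 + first as usize].to_vec());
    }
    let mut len_buf = [0u8; 4];
    len_buf.copy_from_slice(&buf[..4]);
    len_buf[0] &= !LEN_COMPRESSION_BIT_MASK; // strip the tag bits
    let len = u32::from_be_bytes(len_buf) as usize;
    let body = &buf[4..4 + len];
    match first & LEN_COMPRESSION_BIT_MASK {
        BYTE_UNCOMPRESSED => Ok(body.to_vec()),
        BYTE_ZSTD => zstd::decode_all(body),
        other => Err(std::io::Error::new(
            std::io::ErrorKind::InvalidData,
            format!("invalid compression byte {other:x}"),
        )),
    }
}

fn main() -> std::io::Result<()> {
    let blob = vec![0xf3u8; 24 * 8192]; // highly compressible, like the test data below
    let encoded = encode_blob(&blob)?;
    assert!(encoded.len() < blob.len());
    assert_eq!(decode_blob(&encoded)?, blob);
    Ok(())
}
```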
+use async_compression::Level; use bytes::{BufMut, BytesMut}; +use pageserver_api::models::ImageCompressionAlgorithm; +use tokio::io::AsyncWriteExt; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use tracing::warn; use crate::context::RequestContext; use crate::page_cache::PAGE_SZ; @@ -66,12 +73,37 @@ impl<'a> BlockCursor<'a> { len_buf.copy_from_slice(&buf[off..off + 4]); off += 4; } - len_buf[0] &= 0x7f; + let bit_mask = if self.read_compressed { + !LEN_COMPRESSION_BIT_MASK + } else { + 0x7f + }; + len_buf[0] &= bit_mask; u32::from_be_bytes(len_buf) as usize }; + let compression_bits = first_len_byte & LEN_COMPRESSION_BIT_MASK; - dstbuf.clear(); - dstbuf.reserve(len); + let mut tmp_buf = Vec::new(); + let buf_to_write; + let compression = if compression_bits <= BYTE_UNCOMPRESSED || !self.read_compressed { + if compression_bits > BYTE_UNCOMPRESSED { + warn!("reading key above future limit ({len} bytes)"); + } + buf_to_write = dstbuf; + None + } else if compression_bits == BYTE_ZSTD { + buf_to_write = &mut tmp_buf; + Some(dstbuf) + } else { + let error = std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("invalid compression byte {compression_bits:x}"), + ); + return Err(error); + }; + + buf_to_write.clear(); + buf_to_write.reserve(len); // Read the payload let mut remain = len; @@ -85,14 +117,35 @@ impl<'a> BlockCursor<'a> { page_remain = PAGE_SZ; } let this_blk_len = min(remain, page_remain); - dstbuf.extend_from_slice(&buf[off..off + this_blk_len]); + buf_to_write.extend_from_slice(&buf[off..off + this_blk_len]); remain -= this_blk_len; off += this_blk_len; } + + if let Some(dstbuf) = compression { + if compression_bits == BYTE_ZSTD { + let mut decoder = async_compression::tokio::write::ZstdDecoder::new(dstbuf); + decoder.write_all(buf_to_write).await?; + decoder.flush().await?; + } else { + unreachable!("already checked above") + } + } + Ok(()) } } +/// Reserved bits for length and compression +const LEN_COMPRESSION_BIT_MASK: u8 = 0xf0; + +/// The maximum size of blobs we support. The highest few bits +/// are reserved for compression and other further uses. +const MAX_SUPPORTED_LEN: usize = 0x0fff_ffff; + +const BYTE_UNCOMPRESSED: u8 = 0x80; +const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10; + /// A wrapper of `VirtualFile` that allows users to write blobs. /// /// If a `BlobWriter` is dropped, the internal buffer will be @@ -219,6 +272,18 @@ impl BlobWriter { &mut self, srcbuf: B, ctx: &RequestContext, + ) -> (B::Buf, Result) { + self.write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled) + .await + } + + /// Write a blob of data. Returns the offset that it was written to, + /// which can be used to retrieve the data later. + pub async fn write_blob_maybe_compressed, Buf: IoBuf + Send>( + &mut self, + srcbuf: B, + ctx: &RequestContext, + algorithm: ImageCompressionAlgorithm, ) -> (B::Buf, Result) { let offset = self.offset; @@ -226,29 +291,60 @@ impl BlobWriter { let mut io_buf = self.io_buf.take().expect("we always put it back below"); io_buf.clear(); - let (io_buf, hdr_res) = async { + let mut compressed_buf = None; + let ((io_buf, hdr_res), srcbuf) = async { if len < 128 { // Short blob. 
Write a 1-byte length header io_buf.put_u8(len as u8); - self.write_all(io_buf, ctx).await + ( + self.write_all(io_buf, ctx).await, + srcbuf.slice_full().into_inner(), + ) } else { // Write a 4-byte length header - if len > 0x7fff_ffff { + if len > MAX_SUPPORTED_LEN { return ( - io_buf, - Err(Error::new( - ErrorKind::Other, - format!("blob too large ({len} bytes)"), - )), + ( + io_buf, + Err(Error::new( + ErrorKind::Other, + format!("blob too large ({len} bytes)"), + )), + ), + srcbuf.slice_full().into_inner(), ); } - if len > 0x0fff_ffff { - tracing::warn!("writing blob above future limit ({len} bytes)"); - } - let mut len_buf = (len as u32).to_be_bytes(); - len_buf[0] |= 0x80; + let (high_bit_mask, len_written, srcbuf) = match algorithm { + ImageCompressionAlgorithm::Zstd { level } => { + let mut encoder = if let Some(level) = level { + async_compression::tokio::write::ZstdEncoder::with_quality( + Vec::new(), + Level::Precise(level.into()), + ) + } else { + async_compression::tokio::write::ZstdEncoder::new(Vec::new()) + }; + let slice = srcbuf.slice_full(); + encoder.write_all(&slice[..]).await.unwrap(); + encoder.shutdown().await.unwrap(); + let compressed = encoder.into_inner(); + if compressed.len() < len { + let compressed_len = compressed.len(); + compressed_buf = Some(compressed); + (BYTE_ZSTD, compressed_len, slice.into_inner()) + } else { + (BYTE_UNCOMPRESSED, len, slice.into_inner()) + } + } + ImageCompressionAlgorithm::Disabled => { + (BYTE_UNCOMPRESSED, len, srcbuf.slice_full().into_inner()) + } + }; + let mut len_buf = (len_written as u32).to_be_bytes(); + assert_eq!(len_buf[0] & 0xf0, 0); + len_buf[0] |= high_bit_mask; io_buf.extend_from_slice(&len_buf[..]); - self.write_all(io_buf, ctx).await + (self.write_all(io_buf, ctx).await, srcbuf) } } .await; @@ -257,7 +353,12 @@ impl BlobWriter { Ok(_) => (), Err(e) => return (Slice::into_inner(srcbuf.slice(..)), Err(e)), } - let (srcbuf, res) = self.write_all(srcbuf, ctx).await; + let (srcbuf, res) = if let Some(compressed_buf) = compressed_buf { + let (_buf, res) = self.write_all(compressed_buf, ctx).await; + (Slice::into_inner(srcbuf.slice(..)), res) + } else { + self.write_all(srcbuf, ctx).await + }; (srcbuf, res.map(|_| offset)) } } @@ -295,6 +396,13 @@ mod tests { use rand::{Rng, SeedableRng}; async fn round_trip_test(blobs: &[Vec]) -> Result<(), Error> { + round_trip_test_compressed::(blobs, false).await + } + + async fn round_trip_test_compressed( + blobs: &[Vec], + compression: bool, + ) -> Result<(), Error> { let temp_dir = camino_tempfile::tempdir()?; let pathbuf = temp_dir.path().join("file"); let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); @@ -305,7 +413,16 @@ mod tests { let file = VirtualFile::create(pathbuf.as_path(), &ctx).await?; let mut wtr = BlobWriter::::new(file, 0); for blob in blobs.iter() { - let (_, res) = wtr.write_blob(blob.clone(), &ctx).await; + let (_, res) = if compression { + wtr.write_blob_maybe_compressed( + blob.clone(), + &ctx, + ImageCompressionAlgorithm::Zstd { level: Some(1) }, + ) + .await + } else { + wtr.write_blob(blob.clone(), &ctx).await + }; let offs = res?; offsets.push(offs); } @@ -319,7 +436,7 @@ mod tests { let file = VirtualFile::open(pathbuf.as_path(), &ctx).await?; let rdr = BlockReaderRef::VirtualFile(&file); - let rdr = BlockCursor::new(rdr); + let rdr = BlockCursor::new_with_compression(rdr, compression); for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() { let blob_read = rdr.read_blob(*offset, &ctx).await?; assert_eq!( @@ 
-353,6 +470,8 @@ mod tests { ]; round_trip_test::(blobs).await?; round_trip_test::(blobs).await?; + round_trip_test_compressed::(blobs, true).await?; + round_trip_test_compressed::(blobs, true).await?; Ok(()) } @@ -361,10 +480,15 @@ mod tests { let blobs = &[ b"test".to_vec(), random_array(10 * PAGE_SZ), + b"hello".to_vec(), + random_array(66 * PAGE_SZ), + vec![0xf3; 24 * PAGE_SZ], b"foobar".to_vec(), ]; round_trip_test::(blobs).await?; round_trip_test::(blobs).await?; + round_trip_test_compressed::(blobs, true).await?; + round_trip_test_compressed::(blobs, true).await?; Ok(()) } diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index b406d5033243..601b09515519 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -37,6 +37,7 @@ where pub enum BlockLease<'a> { PageReadGuard(PageReadGuard<'static>), EphemeralFileMutableTail(&'a [u8; PAGE_SZ]), + Slice(&'a [u8; PAGE_SZ]), #[cfg(test)] Arc(std::sync::Arc<[u8; PAGE_SZ]>), #[cfg(test)] @@ -63,6 +64,7 @@ impl<'a> Deref for BlockLease<'a> { match self { BlockLease::PageReadGuard(v) => v.deref(), BlockLease::EphemeralFileMutableTail(v) => v, + BlockLease::Slice(v) => v, #[cfg(test)] BlockLease::Arc(v) => v.deref(), #[cfg(test)] @@ -81,6 +83,7 @@ pub(crate) enum BlockReaderRef<'a> { FileBlockReader(&'a FileBlockReader<'a>), EphemeralFile(&'a EphemeralFile), Adapter(Adapter<&'a DeltaLayerInner>), + Slice(&'a [u8]), #[cfg(test)] TestDisk(&'a super::disk_btree::tests::TestDisk), #[cfg(test)] @@ -99,6 +102,7 @@ impl<'a> BlockReaderRef<'a> { FileBlockReader(r) => r.read_blk(blknum, ctx).await, EphemeralFile(r) => r.read_blk(blknum, ctx).await, Adapter(r) => r.read_blk(blknum, ctx).await, + Slice(s) => Self::read_blk_slice(s, blknum), #[cfg(test)] TestDisk(r) => r.read_blk(blknum), #[cfg(test)] @@ -107,6 +111,24 @@ impl<'a> BlockReaderRef<'a> { } } +impl<'a> BlockReaderRef<'a> { + fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result { + let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap(); + let end = start.checked_add(PAGE_SZ).unwrap(); + if end > slice.len() { + return Err(std::io::Error::new( + std::io::ErrorKind::UnexpectedEof, + format!("slice too short, len={} end={}", slice.len(), end), + )); + } + let slice = &slice[start..end]; + let page_sized: &[u8; PAGE_SZ] = slice + .try_into() + .expect("we add PAGE_SZ to start, so the slice must have PAGE_SZ"); + Ok(BlockLease::Slice(page_sized)) + } +} + /// /// A "cursor" for efficiently reading multiple pages from a BlockReader /// @@ -127,16 +149,24 @@ impl<'a> BlockReaderRef<'a> { /// ``` /// pub struct BlockCursor<'a> { + pub(super) read_compressed: bool, reader: BlockReaderRef<'a>, } impl<'a> BlockCursor<'a> { pub(crate) fn new(reader: BlockReaderRef<'a>) -> Self { - BlockCursor { reader } + Self::new_with_compression(reader, false) + } + pub(crate) fn new_with_compression(reader: BlockReaderRef<'a>, read_compressed: bool) -> Self { + BlockCursor { + read_compressed, + reader, + } } // Needed by cli pub fn new_fileblockreader(reader: &'a FileBlockReader) -> Self { BlockCursor { + read_compressed: false, reader: BlockReaderRef::FileBlockReader(reader), } } @@ -166,11 +196,17 @@ pub struct FileBlockReader<'a> { /// Unique ID of this file, used as key in the page cache. 
file_id: page_cache::FileId, + + compressed_reads: bool, } impl<'a> FileBlockReader<'a> { pub fn new(file: &'a VirtualFile, file_id: FileId) -> Self { - FileBlockReader { file_id, file } + FileBlockReader { + file_id, + file, + compressed_reads: true, + } } /// Read a page from the underlying file into given buffer. @@ -217,7 +253,10 @@ impl<'a> FileBlockReader<'a> { impl BlockReader for FileBlockReader<'_> { fn block_cursor(&self) -> BlockCursor<'_> { - BlockCursor::new(BlockReaderRef::FileBlockReader(self)) + BlockCursor::new_with_compression( + BlockReaderRef::FileBlockReader(self), + self.compressed_reads, + ) } } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 79cc7bf15373..bb65ae24fc5e 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -21,6 +21,7 @@ pub struct EphemeralFile { } mod page_caching; +pub(crate) use page_caching::PrewarmOnWrite as PrewarmPageCacheOnWrite; mod zero_padded_read_write; impl EphemeralFile { @@ -53,7 +54,7 @@ impl EphemeralFile { Ok(EphemeralFile { _tenant_shard_id: tenant_shard_id, _timeline_id: timeline_id, - rw: page_caching::RW::new(file), + rw: page_caching::RW::new(file, conf.l0_flush.prewarm_on_write()), }) } @@ -65,6 +66,11 @@ impl EphemeralFile { self.rw.page_cache_file_id() } + /// See [`self::page_caching::RW::load_to_vec`]. + pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { + self.rw.load_to_vec(ctx).await + } + pub(crate) async fn read_blk( &self, blknum: u32, diff --git a/pageserver/src/tenant/ephemeral_file/page_caching.rs b/pageserver/src/tenant/ephemeral_file/page_caching.rs index 276ac8706493..43b9fff28d98 100644 --- a/pageserver/src/tenant/ephemeral_file/page_caching.rs +++ b/pageserver/src/tenant/ephemeral_file/page_caching.rs @@ -8,6 +8,7 @@ use crate::virtual_file::VirtualFile; use once_cell::sync::Lazy; use std::io::{self, ErrorKind}; +use std::ops::{Deref, Range}; use tokio_epoll_uring::BoundedBuf; use tracing::*; @@ -19,14 +20,23 @@ pub struct RW { rw: super::zero_padded_read_write::RW, } +/// When we flush a block to the underlying [`crate::virtual_file::VirtualFile`], +/// should we pre-warm the [`crate::page_cache`] with the contents? +#[derive(Clone, Copy)] +pub enum PrewarmOnWrite { + Yes, + No, +} + impl RW { - pub fn new(file: VirtualFile) -> Self { + pub fn new(file: VirtualFile, prewarm_on_write: PrewarmOnWrite) -> Self { let page_cache_file_id = page_cache::next_file_id(); Self { page_cache_file_id, rw: super::zero_padded_read_write::RW::new(PreWarmingWriter::new( page_cache_file_id, file, + prewarm_on_write, )), } } @@ -49,6 +59,43 @@ impl RW { self.rw.bytes_written() } + /// Load all blocks that can be read via [`Self::read_blk`] into a contiguous memory buffer. + /// + /// This includes the blocks that aren't yet flushed to disk by the internal buffered writer. + /// The last block is zero-padded to [`PAGE_SZ`], so, the returned buffer is always a multiple of [`PAGE_SZ`]. 
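`load_to_vec` promises a buffer that is always a whole number of `PAGE_SZ` pages, which is what lets the new `BlockReaderRef::Slice` variant in block_io.rs serve `read_blk` straight out of memory. A small standalone sketch of that pairing, assuming `PAGE_SZ` is 8192 as in the page cache; the helper names are illustrative.

```rust
const PAGE_SZ: usize = 8192; // assumed to match the pageserver's page cache page size

/// Zero-pad `data` so its length is a whole number of PAGE_SZ pages,
/// mirroring what load_to_vec promises for the buffer it returns.
fn pad_to_page_multiple(mut data: Vec<u8>) -> Vec<u8> {
    let rem = data.len() % PAGE_SZ;
    if rem != 0 {
        data.resize(data.len() + (PAGE_SZ - rem), 0);
    }
    data
}

/// Serve one page out of a contiguous buffer, the way the new
/// BlockReaderRef::Slice variant does, with an explicit bounds check.
fn read_blk_slice(slice: &[u8], blknum: u32) -> std::io::Result<&[u8; PAGE_SZ]> {
    let start = (blknum as usize).checked_mul(PAGE_SZ).unwrap();
    let end = start.checked_add(PAGE_SZ).unwrap();
    if end > slice.len() {
        return Err(std::io::Error::new(
            std::io::ErrorKind::UnexpectedEof,
            format!("slice too short, len={} end={}", slice.len(), end),
        ));
    }
    Ok(slice[start..end].try_into().expect("slice is PAGE_SZ long"))
}

fn main() {
    let buf = pad_to_page_multiple(vec![1u8; PAGE_SZ + 100]); // a bit over one page of data
    assert_eq!(buf.len(), 2 * PAGE_SZ);
    assert_eq!(read_blk_slice(&buf, 1).unwrap()[100], 0); // tail is zero-padded
    assert!(read_blk_slice(&buf, 2).is_err()); // out of bounds
}
```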
+ pub(super) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { + // round up to the next PAGE_SZ multiple, required by blob_io + let size = { + let s = usize::try_from(self.bytes_written()).unwrap(); + if s % PAGE_SZ == 0 { + s + } else { + s.checked_add(PAGE_SZ - (s % PAGE_SZ)).unwrap() + } + }; + let vec = Vec::with_capacity(size); + + // read from disk what we've already flushed + let writer = self.rw.as_writer(); + let flushed_range = writer.written_range(); + let mut vec = writer + .file + .read_exact_at( + vec.slice(0..(flushed_range.end - flushed_range.start)), + u64::try_from(flushed_range.start).unwrap(), + ctx, + ) + .await? + .into_inner(); + + // copy from in-memory buffer what we haven't flushed yet but would return when accessed via read_blk + let buffered = self.rw.get_tail_zero_padded(); + vec.extend_from_slice(buffered); + assert_eq!(vec.len(), size); + assert_eq!(vec.len() % PAGE_SZ, 0); + Ok(vec) + } + pub(crate) async fn read_blk( &self, blknum: u32, @@ -116,19 +163,40 @@ impl Drop for RW { } struct PreWarmingWriter { + prewarm_on_write: PrewarmOnWrite, nwritten_blocks: u32, page_cache_file_id: page_cache::FileId, file: VirtualFile, } impl PreWarmingWriter { - fn new(page_cache_file_id: page_cache::FileId, file: VirtualFile) -> Self { + fn new( + page_cache_file_id: page_cache::FileId, + file: VirtualFile, + prewarm_on_write: PrewarmOnWrite, + ) -> Self { Self { + prewarm_on_write, nwritten_blocks: 0, page_cache_file_id, file, } } + + /// Return the byte range within `file` that has been written though `write_all`. + /// + /// The returned range would be invalidated by another `write_all`. To prevent that, we capture `&_`. + fn written_range(&self) -> (impl Deref> + '_) { + let nwritten_blocks = usize::try_from(self.nwritten_blocks).unwrap(); + struct Wrapper(Range); + impl Deref for Wrapper { + type Target = Range; + fn deref(&self) -> &Range { + &self.0 + } + } + Wrapper(0..nwritten_blocks * PAGE_SZ) + } } impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmingWriter { @@ -178,45 +246,51 @@ impl crate::virtual_file::owned_buffers_io::write::OwnedAsyncWriter for PreWarmi assert_eq!(&check_bounds_stuff_works, &*buf); } - // Pre-warm page cache with the contents. - // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming - // benefits the code that writes InMemoryLayer=>L0 layers. let nblocks = buflen / PAGE_SZ; let nblocks32 = u32::try_from(nblocks).unwrap(); - let cache = page_cache::get(); - static CTX: Lazy = Lazy::new(|| { - RequestContext::new( - crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache, - crate::context::DownloadBehavior::Error, - ) - }); - for blknum_in_buffer in 0..nblocks { - let blk_in_buffer = &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ]; - let blknum = self - .nwritten_blocks - .checked_add(blknum_in_buffer as u32) - .unwrap(); - match cache - .read_immutable_buf(self.page_cache_file_id, blknum, &CTX) - .await - { - Err(e) => { - error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); - // fail gracefully, it's not the end of the world if we can't pre-warm the cache here - } - Ok(v) => match v { - page_cache::ReadBufResult::Found(_guard) => { - // This function takes &mut self, so, it shouldn't be possible to reach this point. 
- unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \ - and this function takes &mut self, so, no concurrent read_blk is possible"); - } - page_cache::ReadBufResult::NotFound(mut write_guard) => { - write_guard.copy_from_slice(blk_in_buffer); - let _ = write_guard.mark_valid(); + + if matches!(self.prewarm_on_write, PrewarmOnWrite::Yes) { + // Pre-warm page cache with the contents. + // At least in isolated bulk ingest benchmarks (test_bulk_insert.py), the pre-warming + // benefits the code that writes InMemoryLayer=>L0 layers. + + let cache = page_cache::get(); + static CTX: Lazy = Lazy::new(|| { + RequestContext::new( + crate::task_mgr::TaskKind::EphemeralFilePreWarmPageCache, + crate::context::DownloadBehavior::Error, + ) + }); + for blknum_in_buffer in 0..nblocks { + let blk_in_buffer = + &buf[blknum_in_buffer * PAGE_SZ..(blknum_in_buffer + 1) * PAGE_SZ]; + let blknum = self + .nwritten_blocks + .checked_add(blknum_in_buffer as u32) + .unwrap(); + match cache + .read_immutable_buf(self.page_cache_file_id, blknum, &CTX) + .await + { + Err(e) => { + error!("ephemeral_file write_blob failed to get immutable buf to pre-warm page cache: {e:?}"); + // fail gracefully, it's not the end of the world if we can't pre-warm the cache here } - }, + Ok(v) => match v { + page_cache::ReadBufResult::Found(_guard) => { + // This function takes &mut self, so, it shouldn't be possible to reach this point. + unreachable!("we just wrote block {blknum} to the VirtualFile, which is owned by Self, \ + and this function takes &mut self, so, no concurrent read_blk is possible"); + } + page_cache::ReadBufResult::NotFound(mut write_guard) => { + write_guard.copy_from_slice(blk_in_buffer); + let _ = write_guard.mark_valid(); + } + }, + } } } + self.nwritten_blocks = self.nwritten_blocks.checked_add(nblocks32).unwrap(); Ok((buflen, buf.into_inner())) } diff --git a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs index b37eafb52c5b..fe310acab888 100644 --- a/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs +++ b/pageserver/src/tenant/ephemeral_file/zero_padded_read_write.rs @@ -75,6 +75,21 @@ where flushed_offset + u64::try_from(buffer.pending()).unwrap() } + /// Get a slice of all blocks that [`Self::read_blk`] would return as [`ReadResult::ServedFromZeroPaddedMutableTail`]. 
+ pub fn get_tail_zero_padded(&self) -> &[u8] { + let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); + let buffer_written_up_to = buffer.pending(); + // pad to next page boundary + let read_up_to = if buffer_written_up_to % PAGE_SZ == 0 { + buffer_written_up_to + } else { + buffer_written_up_to + .checked_add(PAGE_SZ - (buffer_written_up_to % PAGE_SZ)) + .unwrap() + }; + &buffer.as_zero_padded_slice()[0..read_up_to] + } + pub(crate) async fn read_blk(&self, blknum: u32) -> Result, std::io::Error> { let flushed_offset = self.buffered_writer.as_inner().bytes_written(); let buffer: &zero_padded::Buffer = self.buffered_writer.inspect_buffer(); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 08c3f19b6f75..b0159e22bfc0 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -43,7 +43,8 @@ use crate::tenant::config::{ use crate::tenant::span::debug_assert_current_span_has_tenant_id; use crate::tenant::storage_layer::inmemory_layer; use crate::tenant::timeline::ShutdownMode; -use crate::tenant::{AttachedTenantConf, GcError, SpawnMode, Tenant, TenantState}; +use crate::tenant::{AttachedTenantConf, GcError, LoadConfigError, SpawnMode, Tenant, TenantState}; +use crate::virtual_file::MaybeFatalIo; use crate::{InitializationOrder, TEMP_FILE_SUFFIX}; use utils::crashsafe::path_with_suffix_extension; @@ -272,7 +273,7 @@ pub struct TenantManager { } fn emergency_generations( - tenant_confs: &HashMap>, + tenant_confs: &HashMap>, ) -> HashMap { tenant_confs .iter() @@ -296,7 +297,7 @@ fn emergency_generations( async fn init_load_generations( conf: &'static PageServerConf, - tenant_confs: &HashMap>, + tenant_confs: &HashMap>, resources: &TenantSharedResources, cancel: &CancellationToken, ) -> anyhow::Result>> { @@ -346,56 +347,32 @@ async fn init_load_generations( /// Given a directory discovered in the pageserver's tenants/ directory, attempt /// to load a tenant config from it. /// -/// If file is missing, return Ok(None) +/// If we cleaned up something expected (like an empty dir or a temp dir), return None. 
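The reworked startup scan treats a tenant directory as one of three things: a leftover temporary dir (delete it), an empty dir from a crashed attach (delete it), or a real shard whose name parses as a tenant shard id (load it); anything else is logged and skipped. A rough standalone sketch of that classification, assuming a `___temp` suffix for temporary directories and a 32-hex-character id prefix; the real code uses `TEMP_FILE_SUFFIX` and parses a `TenantShardId` via `FromStr`.

```rust
use std::fs;
use std::path::{Path, PathBuf};

// Assumed suffix for in-progress directories; the real constant is TEMP_FILE_SUFFIX.
const TEMP_SUFFIX: &str = "___temp";

/// Classify one entry of the tenants/ directory the way the reworked
/// load_tenant_config does: clean up temp and empty dirs, skip garbage names,
/// and only return paths that look like real tenant shards.
fn classify_tenant_dir(path: &Path) -> std::io::Result<Option<PathBuf>> {
    let name = path.file_name().and_then(|n| n.to_str()).unwrap_or("");
    if name.ends_with(TEMP_SUFFIX) {
        // Leftover from an interrupted attach: delete and move on.
        fs::remove_dir_all(path)?;
        return Ok(None);
    }
    if fs::read_dir(path)?.next().is_none() {
        // Crashed before a config was written: delete the empty dir.
        fs::remove_dir(path)?;
        return Ok(None);
    }
    // A tenant shard id starts with 32 hex characters (shard suffix optional).
    let id_ok = name
        .get(..32)
        .map_or(false, |p| p.chars().all(|c| c.is_ascii_hexdigit()));
    if !id_ok {
        eprintln!("Invalid tenant path (garbage in our repo directory?): {name}");
        return Ok(None);
    }
    Ok(Some(path.to_path_buf()))
}

fn main() -> std::io::Result<()> {
    for entry in fs::read_dir("tenants")? {
        if let Some(dir) = classify_tenant_dir(&entry?.path())? {
            println!("would load config from {}", dir.display());
        }
    }
    Ok(())
}
```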
fn load_tenant_config( conf: &'static PageServerConf, + tenant_shard_id: TenantShardId, dentry: Utf8DirEntry, -) -> anyhow::Result)>> { +) -> Option> { let tenant_dir_path = dentry.path().to_path_buf(); if crate::is_temporary(&tenant_dir_path) { info!("Found temporary tenant directory, removing: {tenant_dir_path}"); // No need to use safe_remove_tenant_dir_all because this is already // a temporary path - if let Err(e) = std::fs::remove_dir_all(&tenant_dir_path) { - error!( - "Failed to remove temporary directory '{}': {:?}", - tenant_dir_path, e - ); - } - return Ok(None); + std::fs::remove_dir_all(&tenant_dir_path).fatal_err("delete temporary tenant dir"); + return None; } // This case happens if we crash during attachment before writing a config into the dir let is_empty = tenant_dir_path .is_empty_dir() - .with_context(|| format!("Failed to check whether {tenant_dir_path:?} is an empty dir"))?; + .fatal_err("Checking for empty tenant dir"); if is_empty { info!("removing empty tenant directory {tenant_dir_path:?}"); - if let Err(e) = std::fs::remove_dir(&tenant_dir_path) { - error!( - "Failed to remove empty tenant directory '{}': {e:#}", - tenant_dir_path - ) - } - return Ok(None); + std::fs::remove_dir(&tenant_dir_path).fatal_err("delete empty tenant dir"); + return None; } - let tenant_shard_id = match tenant_dir_path - .file_name() - .unwrap_or_default() - .parse::() - { - Ok(id) => id, - Err(_) => { - warn!("Invalid tenant path (garbage in our repo directory?): {tenant_dir_path}",); - return Ok(None); - } - }; - - Ok(Some(( - tenant_shard_id, - Tenant::load_tenant_config(conf, &tenant_shard_id), - ))) + Some(Tenant::load_tenant_config(conf, &tenant_shard_id)) } /// Initial stage of load: walk the local tenants directory, clean up any temp files, @@ -405,32 +382,51 @@ fn load_tenant_config( /// seconds even on reasonably fast drives. async fn init_load_tenant_configs( conf: &'static PageServerConf, -) -> anyhow::Result>> { +) -> HashMap> { let tenants_dir = conf.tenants_path(); - let dentries = tokio::task::spawn_blocking(move || -> anyhow::Result> { - let dir_entries = tenants_dir - .read_dir_utf8() - .with_context(|| format!("Failed to list tenants dir {tenants_dir:?}"))?; + let dentries = tokio::task::spawn_blocking(move || -> Vec { + let context = format!("read tenants dir {tenants_dir}"); + let dir_entries = tenants_dir.read_dir_utf8().fatal_err(&context); - Ok(dir_entries.collect::, std::io::Error>>()?) + dir_entries + .collect::, std::io::Error>>() + .fatal_err(&context) }) - .await??; + .await + .expect("Config load task panicked"); let mut configs = HashMap::new(); let mut join_set = JoinSet::new(); for dentry in dentries { - join_set.spawn_blocking(move || load_tenant_config(conf, dentry)); + let tenant_shard_id = match dentry.file_name().parse::() { + Ok(id) => id, + Err(_) => { + warn!( + "Invalid tenant path (garbage in our repo directory?): '{}'", + dentry.file_name() + ); + continue; + } + }; + + join_set.spawn_blocking(move || { + ( + tenant_shard_id, + load_tenant_config(conf, tenant_shard_id, dentry), + ) + }); } while let Some(r) = join_set.join_next().await { - if let Some((tenant_id, tenant_config)) = r?? 
{ - configs.insert(tenant_id, tenant_config); + let (tenant_shard_id, tenant_config) = r.expect("Panic in config load task"); + if let Some(tenant_config) = tenant_config { + configs.insert(tenant_shard_id, tenant_config); } } - Ok(configs) + configs } #[derive(Debug, thiserror::Error)] @@ -472,7 +468,7 @@ pub async fn init_tenant_mgr( ); // Scan local filesystem for attached tenants - let tenant_configs = init_load_tenant_configs(conf).await?; + let tenant_configs = init_load_tenant_configs(conf).await; // Determine which tenants are to be secondary or attached, and in which generation let tenant_modes = init_load_generations(conf, &tenant_configs, &resources, &cancel).await?; @@ -590,31 +586,23 @@ pub async fn init_tenant_mgr( ); // For those shards that have live configurations, construct `Tenant` or `SecondaryTenant` objects and start them running for (tenant_shard_id, location_conf, config_write_result) in config_write_results { - // Errors writing configs are fatal - config_write_result?; + // Writing a config to local disk is foundational to startup up tenants: panic if we can't. + config_write_result.fatal_err("write tenant shard config file"); let tenant_dir_path = conf.tenant_path(&tenant_shard_id); let shard_identity = location_conf.shard; let slot = match location_conf.mode { - LocationMode::Attached(attached_conf) => { - match tenant_spawn( - conf, - tenant_shard_id, - &tenant_dir_path, - resources.clone(), - AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), - shard_identity, - Some(init_order.clone()), - SpawnMode::Lazy, - &ctx, - ) { - Ok(tenant) => TenantSlot::Attached(tenant), - Err(e) => { - error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}"); - continue; - } - } - } + LocationMode::Attached(attached_conf) => TenantSlot::Attached(tenant_spawn( + conf, + tenant_shard_id, + &tenant_dir_path, + resources.clone(), + AttachedTenantConf::new(location_conf.tenant_conf, attached_conf), + shard_identity, + Some(init_order.clone()), + SpawnMode::Lazy, + &ctx, + )), LocationMode::Secondary(secondary_conf) => { info!( tenant_id = %tenant_shard_id.tenant_id, @@ -649,8 +637,7 @@ pub async fn init_tenant_mgr( }) } -/// Wrapper for Tenant::spawn that checks invariants before running, and inserts -/// a broken tenant in the map if Tenant::spawn fails. +/// Wrapper for Tenant::spawn that checks invariants before running #[allow(clippy::too_many_arguments)] fn tenant_spawn( conf: &'static PageServerConf, @@ -662,23 +649,18 @@ fn tenant_spawn( init_order: Option, mode: SpawnMode, ctx: &RequestContext, -) -> anyhow::Result> { - anyhow::ensure!( - tenant_path.is_dir(), - "Cannot load tenant from path {tenant_path:?}, it either does not exist or not a directory" - ); - anyhow::ensure!( - !crate::is_temporary(tenant_path), - "Cannot load tenant from temporary path {tenant_path:?}" - ); - anyhow::ensure!( - !tenant_path.is_empty_dir().with_context(|| { - format!("Failed to check whether {tenant_path:?} is an empty dir") - })?, - "Cannot load tenant from empty directory {tenant_path:?}" - ); - - let tenant = Tenant::spawn( +) -> Arc { + // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed + // path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode + // to avoid impacting prod runtime performance. 
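Several of these startup paths now call `.fatal_err(context)` instead of threading `anyhow` errors upward: if the pageserver cannot read or write its own tenant configs, it should crash loudly rather than start with a partial tenant map. A minimal sketch of what such an extension trait can look like; the real `MaybeFatalIo` in `virtual_file` may differ in how it logs and exits.

```rust
/// Sketch of a `fatal_err`-style extension trait: unwrap the Ok value or
/// log the error with a short context string and terminate the process.
trait FatalErr<T> {
    fn fatal_err(self, context: &str) -> T;
}

impl<T, E: std::fmt::Display> FatalErr<T> for Result<T, E> {
    fn fatal_err(self, context: &str) -> T {
        match self {
            Ok(v) => v,
            Err(e) => {
                // Startup cannot proceed without this resource; crash loudly
                // rather than limping along with a broken tenant map.
                eprintln!("fatal error: {context}: {e}");
                std::process::abort();
            }
        }
    }
}

fn main() {
    // Reading a config that must exist at startup: any failure is fatal,
    // so this aborts if the file is missing or unreadable.
    let contents: String =
        std::fs::read_to_string("tenants/config-v1").fatal_err("read tenant shard config file");
    println!("{} bytes of config", contents.len());
}
```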
+ assert!(!crate::is_temporary(tenant_path)); + debug_assert!(tenant_path.is_dir()); + debug_assert!(conf + .tenant_location_config_path(&tenant_shard_id) + .try_exists() + .unwrap()); + + Tenant::spawn( conf, tenant_shard_id, resources, @@ -687,9 +669,7 @@ fn tenant_spawn( init_order, mode, ctx, - ); - - Ok(tenant) + ) } async fn shutdown_all_tenants0(tenants: &std::sync::RwLock) { @@ -840,8 +820,9 @@ pub(crate) enum UpsertLocationError { #[error("Failed to flush: {0}")] Flush(anyhow::Error), + /// This error variant is for unexpected situations (soft assertions) where the system is in an unexpected state. #[error("Internal error: {0}")] - Other(#[from] anyhow::Error), + InternalError(anyhow::Error), } impl TenantManager { @@ -971,7 +952,8 @@ impl TenantManager { match fast_path_taken { Some(FastPathModified::Attached(tenant)) => { Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await?; + .await + .fatal_err("write tenant shard config"); // Transition to AttachedStale means we may well hold a valid generation // still, and have been requested to go stale as part of a migration. If @@ -1001,7 +983,8 @@ impl TenantManager { } Some(FastPathModified::Secondary(_secondary_tenant)) => { Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) - .await?; + .await + .fatal_err("write tenant shard config"); return Ok(None); } @@ -1067,7 +1050,7 @@ impl TenantManager { Some(TenantSlot::InProgress(_)) => { // This should never happen: acquire_slot should error out // if the contents of a slot were InProgress. - return Err(UpsertLocationError::Other(anyhow::anyhow!( + return Err(UpsertLocationError::InternalError(anyhow::anyhow!( "Acquired an InProgress slot, this is a bug." ))); } @@ -1086,12 +1069,14 @@ impl TenantManager { // Does not need to be fsync'd because local storage is just a cache. tokio::fs::create_dir_all(&timelines_path) .await - .with_context(|| format!("Creating {timelines_path}"))?; + .fatal_err("create timelines/ dir"); // Before activating either secondary or attached mode, persist the // configuration, so that on restart we will re-attach (or re-start // secondary) on the tenant. - Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config).await?; + Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config) + .await + .fatal_err("write tenant shard config"); let new_slot = match &new_location_config.mode { LocationMode::Secondary(secondary_config) => { @@ -1110,13 +1095,15 @@ impl TenantManager { // from upserts. This enables creating generation-less tenants even though neon_local // always uses generations when calling the location conf API. let attached_conf = if cfg!(feature = "testing") { - let mut conf = AttachedTenantConf::try_from(new_location_config)?; + let mut conf = AttachedTenantConf::try_from(new_location_config) + .map_err(UpsertLocationError::BadRequest)?; if self.conf.control_plane_api.is_none() { conf.location.generation = Generation::none(); } conf } else { - AttachedTenantConf::try_from(new_location_config)? + AttachedTenantConf::try_from(new_location_config) + .map_err(UpsertLocationError::BadRequest)? 
}; let tenant = tenant_spawn( @@ -1129,7 +1116,7 @@ impl TenantManager { None, spawn_mode, ctx, - )?; + ); TenantSlot::Attached(tenant) } @@ -1143,7 +1130,7 @@ impl TenantManager { match slot_guard.upsert(new_slot) { Err(TenantSlotUpsertError::InternalError(e)) => { - Err(UpsertLocationError::Other(anyhow::anyhow!(e))) + Err(UpsertLocationError::InternalError(anyhow::anyhow!(e))) } Err(TenantSlotUpsertError::MapState(e)) => Err(UpsertLocationError::Unavailable(e)), Err(TenantSlotUpsertError::ShuttingDown((new_slot, _completion))) => { @@ -1250,7 +1237,7 @@ impl TenantManager { None, SpawnMode::Eager, ctx, - )?; + ); slot_guard.upsert(TenantSlot::Attached(tenant))?; @@ -1984,7 +1971,7 @@ impl TenantManager { None, SpawnMode::Eager, ctx, - )?; + ); slot_guard.upsert(TenantSlot::Attached(tenant))?; diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index e33e4b84aa97..bc9364de61d4 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -519,7 +519,7 @@ impl RemoteTimelineClient { local_path: &Utf8Path, cancel: &CancellationToken, ctx: &RequestContext, - ) -> anyhow::Result { + ) -> Result { let downloaded_size = { let _unfinished_gauge_guard = self.metrics.call_begin( &RemoteOpFileKind::Layer, diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index af6840f525ae..a233d11c4a11 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -23,6 +23,8 @@ use super::{ storage_layer::LayerName, }; +use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE; +use metrics::UIntGauge; use pageserver_api::{ models, shard::{ShardIdentity, TenantShardId}, @@ -99,6 +101,17 @@ pub(crate) struct SecondaryTenant { // Public state indicating overall progress of downloads relative to the last heatmap seen pub(crate) progress: std::sync::Mutex, + + // Sum of layer sizes on local disk + pub(super) resident_size_metric: UIntGauge, +} + +impl Drop for SecondaryTenant { + fn drop(&mut self) { + let tenant_id = self.tenant_shard_id.tenant_id.to_string(); + let shard_id = format!("{}", self.tenant_shard_id.shard_slug()); + let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]); + } } impl SecondaryTenant { @@ -108,6 +121,12 @@ impl SecondaryTenant { tenant_conf: TenantConfOpt, config: &SecondaryLocationConfig, ) -> Arc { + let tenant_id = tenant_shard_id.tenant_id.to_string(); + let shard_id = format!("{}", tenant_shard_id.shard_slug()); + let resident_size_metric = SECONDARY_RESIDENT_PHYSICAL_SIZE + .get_metric_with_label_values(&[&tenant_id, &shard_id]) + .unwrap(); + Arc::new(Self { tenant_shard_id, // todo: shall we make this a descendent of the @@ -123,6 +142,8 @@ impl SecondaryTenant { detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())), progress: std::sync::Mutex::default(), + + resident_size_metric, }) } @@ -211,16 +232,12 @@ impl SecondaryTenant { // have to 100% match what is on disk, because it's a best-effort warming // of the cache. let mut detail = this.detail.lock().unwrap(); - if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) { - let removed = timeline_detail.on_disk_layers.remove(&name); - - // We might race with removal of the same layer during downloads, if it was removed - // from the heatmap. If we see that the OnDiskState is gone, then no need to - // do a physical deletion or store in evicted_at. 
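The new per-shard resident-size gauge follows a standard labelled-metric lifecycle: resolve the child gauge once when the `SecondaryTenant` is built, add and subtract layer sizes as they are downloaded or evicted, and remove the label series on drop so detached shards do not linger in `/metrics`. A standalone sketch of that lifecycle using the `prometheus` crate directly (the patch goes through the workspace `metrics` wrapper and a `UIntGauge`); the `ShardMetrics` struct is illustrative.

```rust
use prometheus::{IntGauge, IntGaugeVec, Opts};

/// Illustrative stand-in for SecondaryTenant: owns the labelled gauge child
/// and removes its label series again on Drop.
struct ShardMetrics {
    tenant_id: String,
    shard_id: String,
    resident_size: IntGauge,
    vec: IntGaugeVec,
}

impl ShardMetrics {
    fn new(vec: &IntGaugeVec, tenant_id: &str, shard_id: &str) -> Self {
        ShardMetrics {
            tenant_id: tenant_id.to_string(),
            shard_id: shard_id.to_string(),
            resident_size: vec.with_label_values(&[tenant_id, shard_id]),
            vec: vec.clone(),
        }
    }
    fn layer_downloaded(&self, bytes: i64) {
        self.resident_size.add(bytes);
    }
    fn layer_evicted(&self, bytes: i64) {
        self.resident_size.sub(bytes);
    }
}

impl Drop for ShardMetrics {
    fn drop(&mut self) {
        // Keep the exporter free of series for shards this pageserver no longer hosts.
        let _ = self
            .vec
            .remove_label_values(&[self.tenant_id.as_str(), self.shard_id.as_str()]);
    }
}

fn main() {
    let vec = IntGaugeVec::new(
        Opts::new("secondary_resident_physical_size", "bytes on local disk"),
        &["tenant_id", "shard_id"],
    )
    .unwrap();
    let m = ShardMetrics::new(&vec, "tenant-a", "0001");
    m.layer_downloaded(8 << 20);
    m.layer_evicted(1 << 20);
    assert_eq!(m.resident_size.get(), 7 << 20);
    drop(m); // the tenant-a/0001 series is removed from the vec here
}
```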
- if let Some(removed) = removed { - removed.remove_blocking(); - timeline_detail.evicted_at.insert(name, now); - } + if let Some(removed) = + detail.evict_layer(name, &timeline_id, now, &this.resident_size_metric) + { + // We might race with removal of the same layer during downloads, so finding the layer we + // were trying to remove is optional. Only issue the disk I/O to remove it if we found it. + removed.remove_blocking(); } }) .await diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 24176ecf1956..27439d4f030d 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -46,6 +46,7 @@ use crate::tenant::{ use camino::Utf8PathBuf; use chrono::format::{DelayedFormat, StrftimeItems}; use futures::Future; +use metrics::UIntGauge; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; use remote_storage::{DownloadError, Etag, GenericRemoteStorage}; @@ -131,16 +132,66 @@ impl OnDiskState { .or_else(fs_ext::ignore_not_found) .fatal_err("Deleting secondary layer") } + + pub(crate) fn file_size(&self) -> u64 { + self.metadata.file_size + } } #[derive(Debug, Clone, Default)] pub(super) struct SecondaryDetailTimeline { - pub(super) on_disk_layers: HashMap, + on_disk_layers: HashMap, /// We remember when layers were evicted, to prevent re-downloading them. pub(super) evicted_at: HashMap, } +impl SecondaryDetailTimeline { + pub(super) fn remove_layer( + &mut self, + name: &LayerName, + resident_metric: &UIntGauge, + ) -> Option { + let removed = self.on_disk_layers.remove(name); + if let Some(removed) = &removed { + resident_metric.sub(removed.file_size()); + } + removed + } + + /// `local_path` + fn touch_layer( + &mut self, + conf: &'static PageServerConf, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + touched: &HeatMapLayer, + resident_metric: &UIntGauge, + local_path: F, + ) where + F: FnOnce() -> Utf8PathBuf, + { + use std::collections::hash_map::Entry; + match self.on_disk_layers.entry(touched.name.clone()) { + Entry::Occupied(mut v) => { + v.get_mut().access_time = touched.access_time; + } + Entry::Vacant(e) => { + e.insert(OnDiskState::new( + conf, + tenant_shard_id, + timeline_id, + touched.name.clone(), + touched.metadata.clone(), + touched.access_time, + local_path(), + )); + resident_metric.add(touched.metadata.file_size); + } + } + } +} + // Aspects of a heatmap that we remember after downloading it #[derive(Clone, Debug)] struct DownloadSummary { @@ -158,7 +209,7 @@ pub(super) struct SecondaryDetail { last_download: Option, next_download: Option, - pub(super) timelines: HashMap, + timelines: HashMap, } /// Helper for logging SystemTime @@ -191,6 +242,38 @@ impl SecondaryDetail { } } + pub(super) fn evict_layer( + &mut self, + name: LayerName, + timeline_id: &TimelineId, + now: SystemTime, + resident_metric: &UIntGauge, + ) -> Option { + let timeline = self.timelines.get_mut(timeline_id)?; + let removed = timeline.remove_layer(&name, resident_metric); + if removed.is_some() { + timeline.evicted_at.insert(name, now); + } + removed + } + + pub(super) fn remove_timeline( + &mut self, + timeline_id: &TimelineId, + resident_metric: &UIntGauge, + ) { + let removed = self.timelines.remove(timeline_id); + if let Some(removed) = removed { + resident_metric.sub( + removed + .on_disk_layers + .values() + .map(|l| l.metadata.file_size) + .sum(), + ); + } + } + /// Additionally returns the total number of layers, used for more 
stable relative access time /// based eviction. pub(super) fn get_layers_for_eviction( @@ -262,6 +345,7 @@ impl scheduler::RunningJob for RunningDownload { struct CompleteDownload { secondary_state: Arc, completed_at: Instant, + result: Result<(), UpdateError>, } impl scheduler::Completion for CompleteDownload { @@ -286,21 +370,33 @@ impl JobGenerator { + // Start downloading again as soon as we can. This will involve waiting for the scheduler's + // scheduling interval. This slightly reduces the peak download speed of tenants that hit their + // deadline and keep restarting, but that also helps give other tenants a chance to execute rather + // that letting one big tenant dominate for a long time. + detail.next_download = Some(Instant::now()); + } + _ => { + let period = detail + .last_download + .as_ref() + .map(|d| d.upload_period) + .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL); + + // We advance next_download irrespective of errors: we don't want error cases to result in + // expensive busy-polling. + detail.next_download = Some(Instant::now() + period_jitter(period, 5)); + } + } } async fn schedule(&mut self) -> SchedulingResult { @@ -396,9 +492,10 @@ impl JobGenerator { tracing::info!("No heatmap found for tenant. This is fine if it is new."); @@ -415,6 +512,9 @@ impl JobGenerator { tracing::error!("Error while downloading tenant: {e}"); }, + Err(UpdateError::Restart) => { + tracing::info!("Download reached deadline & will restart to update heatmap") + } Ok(()) => {} }; @@ -436,6 +536,7 @@ impl JobGenerator { /// Errors that may be encountered while updating a tenant #[derive(thiserror::Error, Debug)] enum UpdateError { + /// This is not a true failure, but it's how a download indicates that it would like to be restarted by + /// the scheduler, to pick up the latest heatmap + #[error("Reached deadline, restarting downloads")] + Restart, + #[error("No remote data found")] NoData, #[error("Insufficient local storage space")] @@ -578,8 +684,13 @@ impl<'a> TenantDownloader<'a> { Some(t) => t, None => { // We have no existing state: need to scan local disk for layers first. - let timeline_state = - init_timeline_state(self.conf, tenant_shard_id, timeline).await; + let timeline_state = init_timeline_state( + self.conf, + tenant_shard_id, + timeline, + &self.secondary_state.resident_size_metric, + ) + .await; // Re-acquire detail lock now that we're done with async load from local FS self.secondary_state @@ -603,6 +714,26 @@ impl<'a> TenantDownloader<'a> { self.prepare_timelines(&heatmap, heatmap_mtime).await?; } + // Calculate a deadline for downloads: if downloading takes longer than this, it is useful to drop out and start again, + // so that we are always using reasonably a fresh heatmap. Otherwise, if we had really huge content to download, we might + // spend 10s of minutes downloading layers we don't need. + // (see https://github.com/neondatabase/neon/issues/8182) + let deadline = { + let period = self + .secondary_state + .detail + .lock() + .unwrap() + .last_download + .as_ref() + .map(|d| d.upload_period) + .unwrap_or(DEFAULT_DOWNLOAD_INTERVAL); + + // Use double the period: we are not promising to complete within the period, this is just a heuristic + // to keep using a "reasonably fresh" heatmap. 
+ Instant::now() + period * 2 + }; + // Download the layers in the heatmap for timeline in heatmap.timelines { let timeline_state = timeline_states @@ -618,7 +749,7 @@ impl<'a> TenantDownloader<'a> { } let timeline_id = timeline.timeline_id; - self.download_timeline(timeline, timeline_state, ctx) + self.download_timeline(timeline, timeline_state, deadline, ctx) .instrument(tracing::info_span!( "secondary_download_timeline", tenant_id=%tenant_shard_id.tenant_id, @@ -628,6 +759,25 @@ impl<'a> TenantDownloader<'a> { .await?; } + // Metrics consistency check in testing builds + if cfg!(feature = "testing") { + let detail = self.secondary_state.detail.lock().unwrap(); + let resident_size = detail + .timelines + .values() + .map(|tl| { + tl.on_disk_layers + .values() + .map(|v| v.metadata.file_size) + .sum::() + }) + .sum::(); + assert_eq!( + resident_size, + self.secondary_state.resident_size_metric.get() + ); + } + // Only update last_etag after a full successful download: this way will not skip // the next download, even if the heatmap's actual etag is unchanged. self.secondary_state.detail.lock().unwrap().last_download = Some(DownloadSummary { @@ -740,7 +890,7 @@ impl<'a> TenantDownloader<'a> { for delete_timeline in &delete_timelines { // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal // from disk fails that will be a fatal error. - detail.timelines.remove(delete_timeline); + detail.remove_timeline(delete_timeline, &self.secondary_state.resident_size_metric); } } @@ -758,7 +908,7 @@ impl<'a> TenantDownloader<'a> { let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else { continue; }; - timeline_state.on_disk_layers.remove(&layer_name); + timeline_state.remove_layer(&layer_name, &self.secondary_state.resident_size_metric); } for timeline_id in delete_timelines { @@ -827,26 +977,28 @@ impl<'a> TenantDownloader<'a> { .and_then(|x| x) } - async fn download_timeline( + /// Download heatmap layers that are not present on local disk, or update their + /// access time if they are already present. + async fn download_timeline_layers( &self, + tenant_shard_id: &TenantShardId, timeline: HeatMapTimeline, timeline_state: SecondaryDetailTimeline, + deadline: Instant, ctx: &RequestContext, - ) -> Result<(), UpdateError> { - debug_assert_current_span_has_tenant_and_timeline_id(); - let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); - + ) -> (Result<(), UpdateError>, Vec) { // Accumulate updates to the state let mut touched = Vec::new(); - tracing::debug!(timeline_id=%timeline.timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); - - // Download heatmap layers that are not present on local disk, or update their - // access time if they are already present. for layer in timeline.layers { if self.secondary_state.cancel.is_cancelled() { tracing::debug!("Cancelled -- dropping out of layer loop"); - return Err(UpdateError::Cancelled); + return (Err(UpdateError::Cancelled), touched); + } + + if Instant::now() > deadline { + // We've been running downloads for a while, restart to download latest heatmap. + return (Err(UpdateError::Restart), touched); } // Existing on-disk layers: just update their access time. @@ -916,52 +1068,66 @@ impl<'a> TenantDownloader<'a> { match self .download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx) - .await? 
+ .await { - Some(layer) => touched.push(layer), - None => { + Ok(Some(layer)) => touched.push(layer), + Ok(None) => { // Not an error but we didn't download it: remote layer is missing. Don't add it to the list of // things to consider touched. } + Err(e) => { + return (Err(e), touched); + } } } - // Write updates to state to record layers we just downloaded or touched. + (Ok(()), touched) + } + + async fn download_timeline( + &self, + timeline: HeatMapTimeline, + timeline_state: SecondaryDetailTimeline, + deadline: Instant, + ctx: &RequestContext, + ) -> Result<(), UpdateError> { + debug_assert_current_span_has_tenant_and_timeline_id(); + let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); + let timeline_id = timeline.timeline_id; + + tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len()); + + let (result, touched) = self + .download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx) + .await; + + // Write updates to state to record layers we just downloaded or touched, irrespective of whether the overall result was successful { let mut detail = self.secondary_state.detail.lock().unwrap(); - let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default(); + let timeline_detail = detail.timelines.entry(timeline_id).or_default(); tracing::info!("Wrote timeline_detail for {} touched layers", touched.len()); - - for t in touched { - use std::collections::hash_map::Entry; - match timeline_detail.on_disk_layers.entry(t.name.clone()) { - Entry::Occupied(mut v) => { - v.get_mut().access_time = t.access_time; - } - Entry::Vacant(e) => { - let local_path = local_layer_path( + touched.into_iter().for_each(|t| { + timeline_detail.touch_layer( + self.conf, + tenant_shard_id, + &timeline_id, + &t, + &self.secondary_state.resident_size_metric, + || { + local_layer_path( self.conf, tenant_shard_id, - &timeline.timeline_id, + &timeline_id, &t.name, &t.metadata.generation, - ); - e.insert(OnDiskState::new( - self.conf, - tenant_shard_id, - &timeline.timeline_id, - t.name, - t.metadata.clone(), - t.access_time, - local_path, - )); - } - } - } + ) + }, + ) + }); } - Ok(()) + result } /// Call this during timeline download if a layer will _not_ be downloaded, to update progress statistics @@ -1067,6 +1233,7 @@ async fn init_timeline_state( conf: &'static PageServerConf, tenant_shard_id: &TenantShardId, heatmap: &HeatMapTimeline, + resident_metric: &UIntGauge, ) -> SecondaryDetailTimeline { let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id); let mut detail = SecondaryDetailTimeline::default(); @@ -1142,17 +1309,13 @@ async fn init_timeline_state( } else { // We expect the access time to be initialized immediately afterwards, when // the latest heatmap is applied to the state. 
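The refactored secondary download pass separates partial progress from the outcome: the layer loop hands back everything it touched together with a `Restart`, cancel, or error result, the caller records that progress unconditionally, and a `Restart` (deadline hit) reschedules the tenant immediately so it picks up a fresh heatmap. A compact standalone sketch of that shape with illustrative types; in the patch the deadline is twice the heatmap upload period.

```rust
use std::time::{Duration, Instant};

// Illustrative outcome type, shaped like UpdateError::Restart in the patch.
#[derive(Debug)]
enum Outcome {
    Done,
    Restart, // deadline hit: persist progress, fetch a fresh heatmap, go again
}

/// Download a batch of layers until done or until the deadline passes.
/// Always returns the layers we touched so the caller can record them,
/// even when the pass is cut short.
fn download_layers(todo: &[&str], deadline: Instant) -> (Outcome, Vec<String>) {
    let mut touched = Vec::new();
    for layer in todo {
        if Instant::now() > deadline {
            return (Outcome::Restart, touched);
        }
        // ... fetch the layer here ...
        touched.push(layer.to_string());
    }
    (Outcome::Done, touched)
}

fn main() {
    // Deadline heuristic from the patch: twice the heatmap upload period.
    let upload_period = Duration::from_millis(10); // assumed value for the demo
    let deadline = Instant::now() + upload_period * 2;

    let heatmap = ["layer-0", "layer-1", "layer-2"];
    let (outcome, touched) = download_layers(&heatmap, deadline);

    // Record progress irrespective of the outcome, then decide when to run again.
    println!("recorded {} layers, outcome {:?}", touched.len(), outcome);
    let next_run = match outcome {
        Outcome::Restart => Instant::now(), // restart as soon as the scheduler allows
        Outcome::Done => Instant::now() + upload_period,
    };
    let _ = next_run;
}
```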
- detail.on_disk_layers.insert( - name.clone(), - OnDiskState::new( - conf, - tenant_shard_id, - &heatmap.timeline_id, - name, - remote_meta.metadata.clone(), - remote_meta.access_time, - file_path, - ), + detail.touch_layer( + conf, + tenant_shard_id, + &heatmap.timeline_id, + remote_meta, + resident_metric, + || file_path, ); } } diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index b2338b620ebf..23354417e788 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -3,6 +3,7 @@ use std::collections::hash_map::Entry; use std::collections::{HashMap, HashSet}; use std::sync::Arc; +use tenant_size_model::svg::SvgBranchKind; use tokio::sync::oneshot::error::RecvError; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; @@ -87,6 +88,9 @@ impl SegmentMeta { LsnKind::BranchPoint => true, LsnKind::GcCutOff => true, LsnKind::BranchEnd => false, + LsnKind::LeasePoint => true, + LsnKind::LeaseStart => false, + LsnKind::LeaseEnd => false, } } } @@ -103,6 +107,21 @@ pub enum LsnKind { GcCutOff, /// Last record LSN BranchEnd, + /// A LSN lease is granted here. + LeasePoint, + /// A lease starts from here. + LeaseStart, + /// Last record LSN for the lease (should have the same LSN as the previous [`LsnKind::LeaseStart`]). + LeaseEnd, +} + +impl From for SvgBranchKind { + fn from(kind: LsnKind) -> Self { + match kind { + LsnKind::LeasePoint | LsnKind::LeaseStart | LsnKind::LeaseEnd => SvgBranchKind::Lease, + _ => SvgBranchKind::Timeline, + } + } } /// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as @@ -124,6 +143,9 @@ pub struct TimelineInputs { /// Cutoff point calculated from the user-supplied 'max_retention_period' retention_param_cutoff: Option, + + /// Lease points on the timeline + lease_points: Vec, } /// Gathers the inputs for the tenant sizing model. @@ -234,6 +256,13 @@ pub(super) async fn gather_inputs( None }; + let lease_points = gc_info + .leases + .keys() + .filter(|&&lsn| lsn > ancestor_lsn) + .copied() + .collect::>(); + // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we // want to query any logical size before initdb_lsn. let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn); @@ -248,6 +277,8 @@ pub(super) async fn gather_inputs( .map(|lsn| (lsn, LsnKind::BranchPoint)) .collect::>(); + lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint))); + drop(gc_info); // Add branch points we collected earlier, just in case there were any that were @@ -296,6 +327,7 @@ pub(super) async fn gather_inputs( if kind == LsnKind::BranchPoint { branchpoint_segments.insert((timeline_id, lsn), segments.len()); } + segments.push(SegmentMeta { segment: Segment { parent: Some(parent), @@ -306,7 +338,45 @@ pub(super) async fn gather_inputs( timeline_id: timeline.timeline_id, kind, }); - parent += 1; + + parent = segments.len() - 1; + + if kind == LsnKind::LeasePoint { + // Needs `LeaseStart` and `LeaseEnd` as well to model lease as a read-only branch that never writes data + // (i.e. it's lsn has not advanced from ancestor_lsn), and therefore the three segments have the same LSN + // value. Without the other two segments, the calculation code would not count the leased LSN as a point + // to be retained. + // Did not use `BranchStart` or `BranchEnd` so we can differentiate branches and leases during debug. + // + // Alt Design: rewrite the entire calculation code to be independent of timeline id. 
Both leases and + // branch points can be given a synthetic id so we can unite them. + let mut lease_parent = parent; + + // Start of a lease. + segments.push(SegmentMeta { + segment: Segment { + parent: Some(lease_parent), + lsn: lsn.0, + size: None, // Filled in later, if necessary + needed: lsn > next_gc_cutoff, // only needed if the point is within rentention. + }, + timeline_id: timeline.timeline_id, + kind: LsnKind::LeaseStart, + }); + lease_parent += 1; + + // End of the lease. + segments.push(SegmentMeta { + segment: Segment { + parent: Some(lease_parent), + lsn: lsn.0, + size: None, // Filled in later, if necessary + needed: true, // everything at the lease LSN must be readable => is needed + }, + timeline_id: timeline.timeline_id, + kind: LsnKind::LeaseEnd, + }); + } } // Current end of the timeline @@ -332,6 +402,7 @@ pub(super) async fn gather_inputs( pitr_cutoff, next_gc_cutoff, retention_param_cutoff, + lease_points, }); } @@ -674,7 +745,8 @@ fn verify_size_for_multiple_branches() { "horizon_cutoff": "0/2210CD0", "pitr_cutoff": "0/2210CD0", "next_gc_cutoff": "0/2210CD0", - "retention_param_cutoff": null + "retention_param_cutoff": null, + "lease_points": [] }, { "timeline_id": "454626700469f0a9914949b9d018e876", @@ -684,7 +756,8 @@ fn verify_size_for_multiple_branches() { "horizon_cutoff": "0/1817770", "pitr_cutoff": "0/1817770", "next_gc_cutoff": "0/1817770", - "retention_param_cutoff": null + "retention_param_cutoff": null, + "lease_points": [] }, { "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f", @@ -694,7 +767,8 @@ fn verify_size_for_multiple_branches() { "horizon_cutoff": "0/18B3D98", "pitr_cutoff": "0/18B3D98", "next_gc_cutoff": "0/18B3D98", - "retention_param_cutoff": null + "retention_param_cutoff": null, + "lease_points": [] } ] } @@ -749,7 +823,8 @@ fn verify_size_for_one_branch() { "horizon_cutoff": "47/240A5860", "pitr_cutoff": "47/240A5860", "next_gc_cutoff": "47/240A5860", - "retention_param_cutoff": "0/0" + "retention_param_cutoff": "0/0", + "lease_points": [] } ] }"#; diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 9607546ce0f2..62730f88b260 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -7,6 +7,9 @@ pub(crate) mod layer; mod layer_desc; mod layer_name; +#[cfg(test)] +pub mod merge_iterator; + use crate::context::{AccessStatsBehavior, RequestContext}; use crate::repository::Value; use crate::task_mgr::TaskKind; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index bf5d9249ebb5..dfd0196c87e9 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -49,7 +49,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; use pageserver_api::keyspace::KeySpace; -use pageserver_api::models::LayerAccessKind; +use pageserver_api::models::{ImageCompressionAlgorithm, LayerAccessKind}; use pageserver_api::shard::TenantShardId; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; @@ -223,6 +223,11 @@ pub struct DeltaLayerInner { file: VirtualFile, file_id: FileId, + #[allow(dead_code)] + layer_key_range: Range, + #[allow(dead_code)] + layer_lsn_range: Range, + max_vectored_read_bytes: Option, } @@ -452,7 +457,12 @@ impl DeltaLayerWriterInner { ctx: &RequestContext, ) -> (Vec, anyhow::Result<()>) { assert!(self.lsn_range.start <= lsn); - let (val, res) = 
self.blob_writer.write_blob(val, ctx).await; + // We don't want to use compression in delta layer creation + let compression = ImageCompressionAlgorithm::Disabled; + let (val, res) = self + .blob_writer + .write_blob_maybe_compressed(val, ctx, compression) + .await; let off = match res { Ok(off) => off, Err(e) => return (val, Err(anyhow::anyhow!(e))), @@ -737,6 +747,16 @@ impl DeltaLayer { } impl DeltaLayerInner { + #[cfg(test)] + pub(crate) fn key_range(&self) -> &Range { + &self.layer_key_range + } + + #[cfg(test)] + pub(crate) fn lsn_range(&self) -> &Range { + &self.layer_lsn_range + } + /// Returns nested result following Result, Critical>: /// - inner has the success or transient failure /// - outer has the permanent failure @@ -785,6 +805,8 @@ impl DeltaLayerInner { index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, max_vectored_read_bytes, + layer_key_range: actual_summary.key_range, + layer_lsn_range: actual_summary.lsn_range, })) } @@ -1492,6 +1514,24 @@ impl DeltaLayerInner { ); offset } + + #[cfg(test)] + pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> { + let block_reader = FileBlockReader::new(&self.file, self.file_id); + let tree_reader = + DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader); + DeltaLayerIterator { + delta_layer: self, + ctx, + index_iter: tree_reader.iter(&[0; DELTA_KEY_SIZE], ctx), + key_values_batch: std::collections::VecDeque::new(), + is_end: false, + planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner::new( + 1024 * 8192, // The default value. Unit tests might use a different value. 1024 * 8K = 8MB buffer. + 1024, // The default value. Unit tests might use a different value + ), + } + } } /// A set of data associated with a delta layer key and its value @@ -1552,7 +1592,71 @@ impl<'a> pageserver_compaction::interface::CompactionDeltaEntry<'a, Key> for Del } #[cfg(test)] -mod test { +pub struct DeltaLayerIterator<'a> { + delta_layer: &'a DeltaLayerInner, + ctx: &'a RequestContext, + planner: crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner, + index_iter: crate::tenant::disk_btree::DiskBtreeIterator<'a>, + key_values_batch: std::collections::VecDeque<(Key, Lsn, Value)>, + is_end: bool, +} + +#[cfg(test)] +impl<'a> DeltaLayerIterator<'a> { + /// Retrieve a batch of key-value pairs into the iterator buffer. 
+ async fn next_batch(&mut self) -> anyhow::Result<()> { + assert!(self.key_values_batch.is_empty()); + assert!(!self.is_end); + + let plan = loop { + if let Some(res) = self.index_iter.next().await { + let (raw_key, value) = res?; + let key = Key::from_slice(&raw_key[..KEY_SIZE]); + let lsn = DeltaKey::extract_lsn_from_buf(&raw_key); + let blob_ref = BlobRef(value); + let offset = blob_ref.pos(); + if let Some(batch_plan) = self.planner.handle(key, lsn, offset, BlobFlag::None) { + break batch_plan; + } + } else { + self.is_end = true; + let data_end_offset = self.delta_layer.index_start_offset(); + break self.planner.handle_range_end(data_end_offset); + } + }; + let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file); + let mut next_batch = std::collections::VecDeque::new(); + let buf_size = plan.size(); + let buf = BytesMut::with_capacity(buf_size); + let blobs_buf = vectored_blob_reader + .read_blobs(&plan, buf, self.ctx) + .await?; + let frozen_buf = blobs_buf.buf.freeze(); + for meta in blobs_buf.blobs.iter() { + let value = Value::des(&frozen_buf[meta.start..meta.end])?; + next_batch.push_back((meta.meta.key, meta.meta.lsn, value)); + } + self.key_values_batch = next_batch; + Ok(()) + } + + pub async fn next(&mut self) -> anyhow::Result> { + if self.key_values_batch.is_empty() { + if self.is_end { + return Ok(None); + } + self.next_batch().await?; + } + Ok(Some( + self.key_values_batch + .pop_front() + .expect("should not be empty"), + )) + } +} + +#[cfg(test)] +pub(crate) mod test { use std::collections::BTreeMap; use itertools::MinMaxResult; @@ -1560,6 +1664,9 @@ mod test { use rand::RngCore; use super::*; + use crate::tenant::harness::TIMELINE_ID; + use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; + use crate::tenant::Tenant; use crate::{ context::DownloadBehavior, task_mgr::TaskKind, @@ -2126,4 +2233,123 @@ mod test { assert_eq!(utils::Hex(&scratch_left), utils::Hex(&scratch_right)); } } + + pub(crate) fn sort_delta( + (k1, l1, _): &(Key, Lsn, Value), + (k2, l2, _): &(Key, Lsn, Value), + ) -> std::cmp::Ordering { + (k1, l1).cmp(&(k2, l2)) + } + + pub(crate) async fn produce_delta_layer( + tenant: &Tenant, + tline: &Arc, + mut deltas: Vec<(Key, Lsn, Value)>, + ctx: &RequestContext, + ) -> anyhow::Result { + deltas.sort_by(sort_delta); + let (key_start, _, _) = deltas.first().unwrap(); + let (key_max, _, _) = deltas.first().unwrap(); + let lsn_min = deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); + let lsn_max = deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap(); + let lsn_end = Lsn(lsn_max.0 + 1); + let mut writer = DeltaLayerWriter::new( + tenant.conf, + tline.timeline_id, + tenant.tenant_shard_id, + *key_start, + (*lsn_min)..lsn_end, + ctx, + ) + .await?; + let key_end = key_max.next(); + + for (key, lsn, value) in deltas { + writer.put_value(key, lsn, value, ctx).await?; + } + let delta_layer = writer.finish(key_end, tline, ctx).await?; + + Ok::<_, anyhow::Error>(delta_layer) + } + + async fn assert_delta_iter_equal( + delta_iter: &mut DeltaLayerIterator<'_>, + expect: &[(Key, Lsn, Value)], + ) { + let mut expect_iter = expect.iter(); + loop { + let o1 = delta_iter.next().await.unwrap(); + let o2 = expect_iter.next(); + assert_eq!(o1.is_some(), o2.is_some()); + if o1.is_none() && o2.is_none() { + break; + } + let (k1, l1, v1) = o1.unwrap(); + let (k2, l2, v2) = o2.unwrap(); + assert_eq!(&k1, k2); + assert_eq!(l1, *l2); + assert_eq!(&v1, v2); + } + } + + #[tokio::test] + async fn delta_layer_iterator() { + use crate::repository::Value; 
+ use bytes::Bytes; + + let harness = TenantHarness::create("delta_layer_iterator").unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + const N: usize = 1000; + let test_deltas = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10), + Lsn(0x10 * ((idx as u64) % 10 + 1)), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer = produce_delta_layer(&tenant, &tline, test_deltas.clone(), &ctx) + .await + .unwrap(); + let delta_layer = resident_layer.get_as_delta(&ctx).await.unwrap(); + for max_read_size in [1, 1024] { + for batch_size in [1, 2, 4, 8, 3, 7, 13] { + println!("running with batch_size={batch_size} max_read_size={max_read_size}"); + // Test if the batch size is correctly determined + let mut iter = delta_layer.iter(&ctx); + iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + let mut num_items = 0; + for _ in 0..3 { + iter.next_batch().await.unwrap(); + num_items += iter.key_values_batch.len(); + if max_read_size == 1 { + // every key should be a batch b/c the value is larger than max_read_size + assert_eq!(iter.key_values_batch.len(), 1); + } else { + assert_eq!(iter.key_values_batch.len(), batch_size); + } + if num_items >= N { + break; + } + iter.key_values_batch.clear(); + } + // Test if the result is correct + let mut iter = delta_layer.iter(&ctx); + iter.planner = StreamingVectoredReadPlanner::new(max_read_size, batch_size); + assert_delta_iter_equal(&mut iter, &test_deltas).await; + } + } + } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 50aacbd9ad46..1e03e1a58c92 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -369,6 +369,16 @@ impl ImageLayer { } impl ImageLayerInner { + #[cfg(test)] + pub(crate) fn key_range(&self) -> &Range { + &self.key_range + } + + #[cfg(test)] + pub(crate) fn lsn(&self) -> Lsn { + self.lsn + } + /// Returns nested result following Result, Critical>: /// - inner has the success or transient failure /// - outer has the permanent failure diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 6624fb7e6ba5..5941a52e9825 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -6,13 +6,14 @@ //! 
use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; +use crate::page_cache::PAGE_SZ; use crate::repository::{Key, Value}; -use crate::tenant::block_io::BlockReader; +use crate::tenant::block_io::{BlockCursor, BlockReader, BlockReaderRef}; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::storage_layer::ValueReconstructResult; use crate::tenant::timeline::GetVectoredError; use crate::tenant::{PageReconstructError, Timeline}; -use crate::{page_cache, walrecord}; +use crate::{l0_flush, page_cache, walrecord}; use anyhow::{anyhow, ensure, Result}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; @@ -410,6 +411,7 @@ impl InMemoryLayer { continue; } + // TODO: this uses the page cache => https://github.com/neondatabase/neon/issues/8183 let buf = reader.read_blob(block_read.block_offset, &ctx).await; if let Err(e) = buf { reconstruct_state @@ -620,6 +622,13 @@ impl InMemoryLayer { // rare though, so we just accept the potential latency hit for now. let inner = self.inner.read().await; + let l0_flush_global_state = timeline.l0_flush_global_state.inner().clone(); + use l0_flush::Inner; + let _concurrency_permit = match &*l0_flush_global_state { + Inner::PageCached => None, + Inner::Direct { semaphore, .. } => Some(semaphore.acquire().await), + }; + let end_lsn = *self.end_lsn.get().unwrap(); let key_count = if let Some(key_range) = key_range { @@ -645,28 +654,83 @@ impl InMemoryLayer { ) .await?; - let mut buf = Vec::new(); - - let cursor = inner.file.block_cursor(); - - let ctx = RequestContextBuilder::extend(ctx) - .page_content_kind(PageContentKind::InMemoryLayer) - .build(); - for (key, vec_map) in inner.index.iter() { - // Write all page versions - for (lsn, pos) in vec_map.as_slice() { - cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; - let will_init = Value::des(&buf)?.will_init(); - let res; - (buf, res) = delta_layer_writer - .put_value_bytes(*key, *lsn, buf, will_init, &ctx) - .await; - res?; + match &*l0_flush_global_state { + l0_flush::Inner::PageCached => { + let ctx = RequestContextBuilder::extend(ctx) + .page_content_kind(PageContentKind::InMemoryLayer) + .build(); + + let mut buf = Vec::new(); + + let cursor = inner.file.block_cursor(); + + for (key, vec_map) in inner.index.iter() { + // Write all page versions + for (lsn, pos) in vec_map.as_slice() { + cursor.read_blob_into_buf(*pos, &mut buf, &ctx).await?; + let will_init = Value::des(&buf)?.will_init(); + let res; + (buf, res) = delta_layer_writer + .put_value_bytes(*key, *lsn, buf, will_init, &ctx) + .await; + res?; + } + } + } + l0_flush::Inner::Direct { .. } => { + let file_contents: Vec = inner.file.load_to_vec(ctx).await?; + assert_eq!( + file_contents.len() % PAGE_SZ, + 0, + "needed by BlockReaderRef::Slice" + ); + assert_eq!(file_contents.len(), { + let written = usize::try_from(inner.file.len()).unwrap(); + if written % PAGE_SZ == 0 { + written + } else { + written.checked_add(PAGE_SZ - (written % PAGE_SZ)).unwrap() + } + }); + + let cursor = BlockCursor::new(BlockReaderRef::Slice(&file_contents)); + + let mut buf = Vec::new(); + + for (key, vec_map) in inner.index.iter() { + // Write all page versions + for (lsn, pos) in vec_map.as_slice() { + // TODO: once we have blob lengths in the in-memory index, we can + // 1. get rid of the blob_io / BlockReaderRef::Slice business and + // 2. load the file contents into a Bytes and + // 3. the use `Bytes::slice` to get the `buf` that is our blob + // 4. 
pass that `buf` into `put_value_bytes`
+                            // => https://github.com/neondatabase/neon/issues/8183
+                            cursor.read_blob_into_buf(*pos, &mut buf, ctx).await?;
+                            let will_init = Value::des(&buf)?.will_init();
+                            let res;
+                            (buf, res) = delta_layer_writer
+                                .put_value_bytes(*key, *lsn, buf, will_init, ctx)
+                                .await;
+                            res?;
+                        }
+                    }
+                }
+            }
         }

         // MAX is used here because we identify L0 layers by full key range
-        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, &ctx).await?;
+        let delta_layer = delta_layer_writer.finish(Key::MAX, timeline, ctx).await?;
+
+        // Hold the permit until all the IO is done, including the fsync in `delta_layer_writer.finish()`.
+        //
+        // If we didn't and our caller drops this future, tokio-epoll-uring would extend the lifetime of
+        // the `file_contents: Vec<u8>` until the IO is done, but not the permit's lifetime.
+        // Thus, we'd have more concurrent `Vec<u8>` in existence than the semaphore allows.
+        //
+        // We hold across the fsync so that on ext4 mounted with data=ordered, all the kernel page cache pages
+        // we dirtied when writing to the filesystem have been flushed and marked !dirty.
+        drop(_concurrency_permit);
+
         Ok(Some(delta_layer))
     }
 }
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index 5dd947253578..02069c29d264 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1096,19 +1096,10 @@ impl LayerInner {
         match rx.await {
             Ok(Ok(res)) => Ok(res),
-            Ok(Err(e)) => {
-                // sleep already happened in the spawned task, if it was not cancelled
-                match e.downcast_ref::<remote_storage::DownloadError>() {
-                    // If the download failed due to its cancellation token,
-                    // propagate the cancellation error upstream.
-                    Some(remote_storage::DownloadError::Cancelled) => {
-                        Err(DownloadError::DownloadCancelled)
-                    }
-                    // FIXME: this is not embedding the error because historically it would had
-                    // been output to compute, however that is no longer the case.
- _ => Err(DownloadError::DownloadFailed), - } + Ok(Err(remote_storage::DownloadError::Cancelled)) => { + Err(DownloadError::DownloadCancelled) } + Ok(Err(_)) => Err(DownloadError::DownloadFailed), Err(_gone) => Err(DownloadError::DownloadCancelled), } } @@ -1118,7 +1109,7 @@ impl LayerInner { timeline: Arc, permit: heavier_once_cell::InitPermit, ctx: &RequestContext, - ) -> anyhow::Result> { + ) -> Result, remote_storage::DownloadError> { let result = timeline .remote_client .download_layer_file( diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs new file mode 100644 index 000000000000..36386c87c999 --- /dev/null +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -0,0 +1,412 @@ +use std::{ + cmp::Ordering, + collections::{binary_heap, BinaryHeap}, +}; + +use pageserver_api::key::Key; +use utils::lsn::Lsn; + +use crate::{context::RequestContext, repository::Value}; + +use super::{ + delta_layer::{DeltaLayerInner, DeltaLayerIterator}, + image_layer::{ImageLayerInner, ImageLayerIterator}, +}; + +#[derive(Clone, Copy)] +enum LayerRef<'a> { + Image(&'a ImageLayerInner), + Delta(&'a DeltaLayerInner), +} + +impl<'a> LayerRef<'a> { + fn iter(self, ctx: &'a RequestContext) -> LayerIterRef<'a> { + match self { + Self::Image(x) => LayerIterRef::Image(x.iter(ctx)), + Self::Delta(x) => LayerIterRef::Delta(x.iter(ctx)), + } + } +} + +enum LayerIterRef<'a> { + Image(ImageLayerIterator<'a>), + Delta(DeltaLayerIterator<'a>), +} + +impl LayerIterRef<'_> { + async fn next(&mut self) -> anyhow::Result> { + match self { + Self::Delta(x) => x.next().await, + Self::Image(x) => x.next().await, + } + } +} + +/// This type plays several roles at once +/// 1. Unified iterator for image and delta layers. +/// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge). +/// 3. Lazy creation of the real delta/image iterator. +enum IteratorWrapper<'a> { + NotLoaded { + ctx: &'a RequestContext, + first_key_lower_bound: (Key, Lsn), + layer: LayerRef<'a>, + }, + Loaded { + iter: PeekableLayerIterRef<'a>, + }, +} + +struct PeekableLayerIterRef<'a> { + iter: LayerIterRef<'a>, + peeked: Option<(Key, Lsn, Value)>, // None == end +} + +impl<'a> PeekableLayerIterRef<'a> { + async fn create(mut iter: LayerIterRef<'a>) -> anyhow::Result { + let peeked = iter.next().await?; + Ok(Self { iter, peeked }) + } + + fn peek(&self) -> &Option<(Key, Lsn, Value)> { + &self.peeked + } + + async fn next(&mut self) -> anyhow::Result> { + let result = self.peeked.take(); + self.peeked = self.iter.next().await?; + Ok(result) + } +} + +impl<'a> std::cmp::PartialEq for IteratorWrapper<'a> { + fn eq(&self, other: &Self) -> bool { + self.cmp(other) == Ordering::Equal + } +} + +impl<'a> std::cmp::Eq for IteratorWrapper<'a> {} + +impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl<'a> std::cmp::Ord for IteratorWrapper<'a> { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + use std::cmp::Ordering; + let a = self.peek_next_key_lsn(); + let b = other.peek_next_key_lsn(); + match (a, b) { + (Some((k1, l1)), Some((k2, l2))) => { + let loaded_1 = if self.is_loaded() { 1 } else { 0 }; + let loaded_2 = if other.is_loaded() { 1 } else { 0 }; + // When key_lsn are the same, the unloaded iter will always appear before the loaded one. + // And note that we do a reverse at the end of the comparison, so it works with the max heap. 
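+                // Keeping the unloaded iterator first is what lets `MergeIterator::next` load it
+                // (its key is only a lower bound) and re-order the heap before emitting any value
+                // at that key_lsn.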
+ (k1, l1, loaded_1).cmp(&(k2, l2, loaded_2)) + } + (Some(_), None) => Ordering::Less, + (None, Some(_)) => Ordering::Greater, + (None, None) => Ordering::Equal, + } + .reverse() + } +} + +impl<'a> IteratorWrapper<'a> { + pub fn create_from_image_layer( + image_layer: &'a ImageLayerInner, + ctx: &'a RequestContext, + ) -> Self { + Self::NotLoaded { + layer: LayerRef::Image(image_layer), + first_key_lower_bound: (image_layer.key_range().start, image_layer.lsn()), + ctx, + } + } + + pub fn create_from_delta_layer( + delta_layer: &'a DeltaLayerInner, + ctx: &'a RequestContext, + ) -> Self { + Self::NotLoaded { + layer: LayerRef::Delta(delta_layer), + first_key_lower_bound: (delta_layer.key_range().start, delta_layer.lsn_range().start), + ctx, + } + } + + fn peek_next_key_lsn(&self) -> Option<(&Key, Lsn)> { + match self { + Self::Loaded { iter } => iter.peek().as_ref().map(|(key, lsn, _)| (key, *lsn)), + Self::NotLoaded { + first_key_lower_bound: (key, lsn), + .. + } => Some((key, *lsn)), + } + } + + // CORRECTNESS: this function must always take `&mut self`, never `&self`. + // + // The reason is that `impl Ord for Self` evaluates differently after this function + // returns. We're called through a `PeekMut::deref_mut`, which causes heap repair when + // the PeekMut gets returned. So, it's critical that we actually run through `PeekMut::deref_mut` + // and not just `PeekMut::deref` + // If we don't take `&mut self` + async fn load(&mut self) -> anyhow::Result<()> { + assert!(!self.is_loaded()); + let Self::NotLoaded { + ctx, + first_key_lower_bound, + layer, + } = self + else { + unreachable!() + }; + let iter = layer.iter(ctx); + let iter = PeekableLayerIterRef::create(iter).await?; + if let Some((k1, l1, _)) = iter.peek() { + let (k2, l2) = first_key_lower_bound; + debug_assert!((k1, l1) >= (k2, l2)); + } + *self = Self::Loaded { iter }; + Ok(()) + } + + fn is_loaded(&self) -> bool { + matches!(self, Self::Loaded { .. }) + } + + /// Correctness: must load the iterator before using. + /// + /// Given this iterator wrapper is private to the merge iterator, users won't be able to mis-use it. + /// The public interfaces to use are [`crate::tenant::storage_layer::delta_layer::DeltaLayerIterator`] and + /// [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. + async fn next(&mut self) -> anyhow::Result> { + let Self::Loaded { iter } = self else { + panic!("must load the iterator before using") + }; + iter.next().await + } +} + +pub struct MergeIterator<'a> { + heap: BinaryHeap>, +} + +impl<'a> MergeIterator<'a> { + pub fn create( + deltas: &[&'a DeltaLayerInner], + images: &[&'a ImageLayerInner], + ctx: &'a RequestContext, + ) -> Self { + let mut heap = Vec::with_capacity(images.len() + deltas.len()); + for image in images { + heap.push(IteratorWrapper::create_from_image_layer(image, ctx)); + } + for delta in deltas { + heap.push(IteratorWrapper::create_from_delta_layer(delta, ctx)); + } + Self { + heap: BinaryHeap::from(heap), + } + } + + pub async fn next(&mut self) -> anyhow::Result> { + while let Some(mut iter) = self.heap.peek_mut() { + if !iter.is_loaded() { + // Once we load the iterator, we can know the real first key-value pair in the iterator. + // We put it back into the heap so that a potentially unloaded layer may have a key between + // [potential_first_key, loaded_first_key). + iter.load().await?; + continue; + } + let Some(item) = iter.next().await? else { + // If the iterator returns None, we pop this iterator. 
Actually, in the current implementation, + // we order None > Some, and all the rest of the iterators should return None. + binary_heap::PeekMut::pop(iter); + continue; + }; + return Ok(Some(item)); + } + Ok(None) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + use itertools::Itertools; + use pageserver_api::key::Key; + use utils::lsn::Lsn; + + use crate::{ + tenant::{ + harness::{TenantHarness, TIMELINE_ID}, + storage_layer::delta_layer::test::{produce_delta_layer, sort_delta}, + }, + DEFAULT_PG_VERSION, + }; + + async fn assert_merge_iter_equal( + merge_iter: &mut MergeIterator<'_>, + expect: &[(Key, Lsn, Value)], + ) { + let mut expect_iter = expect.iter(); + loop { + let o1 = merge_iter.next().await.unwrap(); + let o2 = expect_iter.next(); + assert_eq!(o1.is_some(), o2.is_some()); + if o1.is_none() && o2.is_none() { + break; + } + let (k1, l1, v1) = o1.unwrap(); + let (k2, l2, v2) = o2.unwrap(); + assert_eq!(&k1, k2); + assert_eq!(l1, *l2); + assert_eq!(&v1, v2); + } + } + + #[tokio::test] + async fn merge_in_between() { + use crate::repository::Value; + use bytes::Bytes; + + let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + let test_deltas1 = vec![ + ( + get_key(0), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ( + get_key(5), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ]; + let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx) + .await + .unwrap(); + let test_deltas2 = vec![ + ( + get_key(3), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ( + get_key(4), + Lsn(0x10), + Value::Image(Bytes::copy_from_slice(b"test")), + ), + ]; + let resident_layer_2 = produce_delta_layer(&tenant, &tline, test_deltas2.clone(), &ctx) + .await + .unwrap(); + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + let mut expect = Vec::new(); + expect.extend(test_deltas1); + expect.extend(test_deltas2); + expect.sort_by(sort_delta); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + } + + #[tokio::test] + async fn delta_merge() { + use crate::repository::Value; + use bytes::Bytes; + + let harness = TenantHarness::create("merge_iterator_delta_merge").unwrap(); + let (tenant, ctx) = harness.load().await; + + let tline = tenant + .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx) + .await + .unwrap(); + + fn get_key(id: u32) -> Key { + let mut key = Key::from_hex("000000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + const N: usize = 1000; + let test_deltas1 = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10), + Lsn(0x20 * ((idx as u64) % 10 + 1)), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer_1 = produce_delta_layer(&tenant, &tline, test_deltas1.clone(), &ctx) + .await + .unwrap(); + let test_deltas2 = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10), + Lsn(0x20 * ((idx as u64) % 10 + 1) + 0x10), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer_2 = produce_delta_layer(&tenant, 
&tline, test_deltas2.clone(), &ctx) + .await + .unwrap(); + let test_deltas3 = (0..N) + .map(|idx| { + ( + get_key(idx as u32 / 10 + N as u32), + Lsn(0x10 * ((idx as u64) % 10 + 1)), + Value::Image(Bytes::from(format!("img{idx:05}"))), + ) + }) + .collect_vec(); + let resident_layer_3 = produce_delta_layer(&tenant, &tline, test_deltas3.clone(), &ctx) + .await + .unwrap(); + let mut merge_iter = MergeIterator::create( + &[ + resident_layer_1.get_as_delta(&ctx).await.unwrap(), + resident_layer_2.get_as_delta(&ctx).await.unwrap(), + resident_layer_3.get_as_delta(&ctx).await.unwrap(), + ], + &[], + &ctx, + ); + let mut expect = Vec::new(); + expect.extend(test_deltas1); + expect.extend(test_deltas2); + expect.extend(test_deltas3); + expect.sort_by(sort_delta); + assert_merge_iter_equal(&mut merge_iter, &expect).await; + + // TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge + } + + // TODO: image layer merge, delta+image mixed merge + // TODO: is it possible to have duplicated delta at same LSN now? we might need to test that +} diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1175b750179d..762e903bf85d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -14,6 +14,7 @@ use anyhow::{anyhow, bail, ensure, Context, Result}; use arc_swap::ArcSwap; use bytes::Bytes; use camino::Utf8Path; +use chrono::{DateTime, Utc}; use enumset::EnumSet; use fail::fail_point; use once_cell::sync::Lazy; @@ -65,13 +66,12 @@ use std::{ ops::{Deref, Range}, }; -use crate::metrics::GetKind; -use crate::pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS; use crate::{ aux_file::AuxFileSizeEstimator, tenant::{ layer_map::{LayerMap, SearchResult}, metadata::TimelineMetadata, + storage_layer::PersistentLayerDesc, }, }; use crate::{ @@ -90,10 +90,15 @@ use crate::{ use crate::{ disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry, }; +use crate::{ + l0_flush::{self, L0FlushGlobalState}, + metrics::GetKind, +}; use crate::{ metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, }; use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; +use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey}; use crate::{ pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, virtual_file::{MaybeFatalIo, VirtualFile}, @@ -208,6 +213,7 @@ pub struct TimelineResources { pub timeline_get_throttle: Arc< crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>, >, + pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } pub(crate) struct AuxFilesState { @@ -360,6 +366,7 @@ pub struct Timeline { repartition_threshold: u64, last_image_layer_creation_check_at: AtomicLsn, + last_image_layer_creation_check_instant: std::sync::Mutex>, /// Current logical size of the "datadir", at the last LSN. current_logical_size: LogicalSize, @@ -433,6 +440,8 @@ pub struct Timeline { /// in the future, add `extra_test_sparse_keyspace` if necessary. #[cfg(test)] pub(crate) extra_test_dense_keyspace: ArcSwap, + + pub(crate) l0_flush_global_state: L0FlushGlobalState, } pub struct WalReceiverInfo { @@ -457,6 +466,9 @@ pub(crate) struct GcInfo { /// Leases granted to particular LSNs. 
pub(crate) leases: BTreeMap, + + /// Whether our branch point is within our ancestor's PITR interval (for cost estimation) + pub(crate) within_ancestor_pitr: bool, } impl GcInfo { @@ -717,6 +729,9 @@ impl From for CompactionError { fn from(e: CreateImageLayersError) -> Self { match e { CreateImageLayersError::Cancelled => CompactionError::ShuttingDown, + CreateImageLayersError::Other(e) => { + CompactionError::Other(e.context("create image layers")) + } _ => CompactionError::Other(e.into()), } } @@ -845,6 +860,18 @@ impl Timeline { .map(|ancestor| ancestor.timeline_id) } + /// Get the bytes written since the PITR cutoff on this branch, and + /// whether this branch's ancestor_lsn is within its parent's PITR. + pub(crate) fn get_pitr_history_stats(&self) -> (u64, bool) { + let gc_info = self.gc_info.read().unwrap(); + let history = self + .get_last_record_lsn() + .checked_sub(gc_info.cutoffs.pitr) + .unwrap_or(Lsn(0)) + .0; + (history, gc_info.within_ancestor_pitr) + } + /// Lock and get timeline's GC cutoff pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard { self.latest_gc_cutoff_lsn.read() @@ -996,6 +1023,7 @@ impl Timeline { } pub(crate) const MAX_GET_VECTORED_KEYS: u64 = 32; + pub(crate) const VEC_GET_LAYERS_VISITED_WARN_THRESH: f64 = 512.0; /// Look up multiple page versions at a given LSN /// @@ -1228,7 +1256,7 @@ impl Timeline { let get_data_timer = crate::metrics::GET_RECONSTRUCT_DATA_TIME .for_get_kind(get_kind) .start_timer(); - self.get_vectored_reconstruct_data(keyspace, lsn, reconstruct_state, ctx) + self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx) .await?; get_data_timer.stop_and_record(); @@ -1258,11 +1286,25 @@ impl Timeline { // (this is a requirement, not a bug). Skip updating the metric in these cases // to avoid infinite results. if !results.is_empty() { + let avg = layers_visited as f64 / results.len() as f64; + if avg >= Self::VEC_GET_LAYERS_VISITED_WARN_THRESH { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy> = + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60)))); + let mut rate_limit = LOGGED.lock().unwrap(); + rate_limit.call(|| { + tracing::info!( + shard_id = %self.tenant_shard_id.shard_slug(), + lsn = %lsn, + "Vectored read for {} visited {} layers on average per key and {} in total. {}/{} pages were returned", + keyspace, avg, layers_visited, results.len(), keyspace.total_raw_size()); + }); + } + // Note that this is an approximation. Tracking the exact number of layers visited // per key requires virtually unbounded memory usage and is inefficient // (i.e. segment tree tracking each range queried from a layer) - crate::metrics::VEC_READ_NUM_LAYERS_VISITED - .observe(layers_visited as f64 / results.len() as f64); + crate::metrics::VEC_READ_NUM_LAYERS_VISITED.observe(avg); } Ok(results) @@ -1554,7 +1596,13 @@ impl Timeline { let existing_lease = occupied.get_mut(); if valid_until > existing_lease.valid_until { existing_lease.valid_until = valid_until; + let dt: DateTime = valid_until.into(); + info!("lease extended to {}", dt); + } else { + let dt: DateTime = existing_lease.valid_until.into(); + info!("existing lease covers greater length, valid until {}", dt); } + existing_lease.clone() } else { // Reject already GC-ed LSN (lsn < latest_gc_cutoff) @@ -1563,6 +1611,8 @@ impl Timeline { bail!("tried to request a page version that was garbage collected. 
requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn); } + let dt: DateTime = valid_until.into(); + info!("lease created, valid until {}", dt); entry.or_insert(LsnLease { valid_until }).clone() } }; @@ -2339,6 +2389,7 @@ impl Timeline { )), repartition_threshold: 0, last_image_layer_creation_check_at: AtomicLsn::new(0), + last_image_layer_creation_check_instant: Mutex::new(None), last_received_wal: Mutex::new(None), rel_size_cache: RwLock::new(RelSizeCache { @@ -2376,6 +2427,8 @@ impl Timeline { #[cfg(test)] extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())), + + l0_flush_global_state: resources.l0_flush_global_state, }; result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; @@ -4417,6 +4470,58 @@ impl Timeline { } } + /// Predicate function which indicates whether we should check if new image layers + /// are required. Since checking if new image layers are required is expensive in + /// terms of CPU, we only do it in the following cases: + /// 1. If the timeline has ingested sufficient WAL to justify the cost + /// 2. If enough time has passed since the last check + /// 2.1. For large tenants, we wish to perform the check more often since they + /// suffer from the lack of image layers + /// 2.2. For small tenants (that can mostly fit in RAM), we use a much longer interval + fn should_check_if_image_layers_required(self: &Arc, lsn: Lsn) -> bool { + const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024; + + let last_checks_at = self.last_image_layer_creation_check_at.load(); + let distance = lsn + .checked_sub(last_checks_at) + .expect("Attempt to compact with LSN going backwards"); + let min_distance = + self.get_image_layer_creation_check_threshold() as u64 * self.get_checkpoint_distance(); + + let distance_based_decision = distance.0 >= min_distance; + + let mut time_based_decision = false; + let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap(); + if let CurrentLogicalSize::Exact(logical_size) = self.current_logical_size.current_size() { + let check_required_after = if Into::::into(&logical_size) >= LARGE_TENANT_THRESHOLD + { + self.get_checkpoint_timeout() + } else { + Duration::from_secs(3600 * 48) + }; + + time_based_decision = match *last_check_instant { + Some(last_check) => { + let elapsed = last_check.elapsed(); + elapsed >= check_required_after + } + None => true, + }; + } + + // Do the expensive delta layer counting only if this timeline has ingested sufficient + // WAL since the last check or a checkpoint timeout interval has elapsed since the last + // check. + let decision = distance_based_decision || time_based_decision; + + if decision { + self.last_image_layer_creation_check_at.store(lsn); + *last_check_instant = Some(Instant::now()); + } + + decision + } + #[tracing::instrument(skip_all, fields(%lsn, %mode))] async fn create_image_layers( self: &Arc, @@ -4439,22 +4544,7 @@ impl Timeline { // image layers <100000000..100000099> and <200000000..200000199> are not completely covering it. let mut start = Key::MIN; - let check_for_image_layers = { - let last_checks_at = self.last_image_layer_creation_check_at.load(); - let distance = lsn - .checked_sub(last_checks_at) - .expect("Attempt to compact with LSN going backwards"); - let min_distance = self.get_image_layer_creation_check_threshold() as u64 - * self.get_checkpoint_distance(); - - // Skip the expensive delta layer counting if this timeline has not ingested sufficient - // WAL since the last check. 
- distance.0 >= min_distance - }; - - if check_for_image_layers { - self.last_image_layer_creation_check_at.store(lsn); - } + let check_for_image_layers = self.should_check_if_image_layers_required(lsn); for partition in partitioning.parts.iter() { let img_range = start..partition.ranges.last().unwrap().end; @@ -4483,6 +4573,22 @@ impl Timeline { start = img_range.end; continue; } + } else if let ImageLayerCreationMode::Force = mode { + // When forced to create image layers, we might try and create them where they already + // exist. This mode is only used in tests/debug. + let layers = self.layers.read().await; + if layers.contains_key(&PersistentLayerKey { + key_range: img_range.clone(), + lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn), + is_delta: false, + }) { + tracing::info!( + "Skipping image layer at {lsn} {}..{}, already exists", + img_range.start, + img_range.end + ); + continue; + } } let image_layer_writer = ImageLayerWriter::new( @@ -4711,6 +4817,42 @@ impl DurationRecorder { } } +/// Descriptor for a delta layer used in testing infra. The start/end key/lsn range of the +/// delta layer might be different from the min/max key/lsn in the delta layer. Therefore, +/// the layer descriptor requires the user to provide the ranges, which should cover all +/// keys specified in the `data` field. +#[cfg(test)] +pub struct DeltaLayerTestDesc { + pub lsn_range: Range, + pub key_range: Range, + pub data: Vec<(Key, Lsn, Value)>, +} + +#[cfg(test)] +impl DeltaLayerTestDesc { + #[allow(dead_code)] + pub fn new(lsn_range: Range, key_range: Range, data: Vec<(Key, Lsn, Value)>) -> Self { + Self { + lsn_range, + key_range, + data, + } + } + + pub fn new_with_inferred_key_range( + lsn_range: Range, + data: Vec<(Key, Lsn, Value)>, + ) -> Self { + let key_min = data.iter().map(|(key, _, _)| key).min().unwrap(); + let key_max = data.iter().map(|(key, _, _)| key).max().unwrap(); + Self { + key_range: (*key_min)..(key_max.next()), + lsn_range, + data, + } + } +} + impl Timeline { async fn finish_compact_batch( self: &Arc, @@ -5481,12 +5623,12 @@ impl Timeline { } images.sort_unstable_by(|(ka, _), (kb, _)| ka.cmp(kb)); let min_key = *images.first().map(|(k, _)| k).unwrap(); - let max_key = images.last().map(|(k, _)| k).unwrap().next(); + let end_key = images.last().map(|(k, _)| k).unwrap().next(); let mut image_layer_writer = ImageLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, - &(min_key..max_key), + &(min_key..end_key), lsn, ctx, ) @@ -5511,37 +5653,65 @@ impl Timeline { #[cfg(test)] pub(super) async fn force_create_delta_layer( self: &Arc, - mut deltas: Vec<(Key, Lsn, Value)>, + mut deltas: DeltaLayerTestDesc, check_start_lsn: Option, ctx: &RequestContext, ) -> anyhow::Result<()> { let last_record_lsn = self.get_last_record_lsn(); - deltas.sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb))); - let min_key = *deltas.first().map(|(k, _, _)| k).unwrap(); - let max_key = deltas.last().map(|(k, _, _)| k).unwrap().next(); - let min_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).min().unwrap(); - let max_lsn = *deltas.iter().map(|(_, lsn, _)| lsn).max().unwrap(); + deltas + .data + .sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb))); + assert!(deltas.data.first().unwrap().0 >= deltas.key_range.start); + assert!(deltas.data.last().unwrap().0 < deltas.key_range.end); + for (_, lsn, _) in &deltas.data { + assert!(deltas.lsn_range.start <= *lsn && *lsn < deltas.lsn_range.end); + } assert!( - max_lsn <= last_record_lsn, - "advance last record 
lsn before inserting a layer, max_lsn={max_lsn}, last_record_lsn={last_record_lsn}" + deltas.lsn_range.end <= last_record_lsn, + "advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}", + deltas.lsn_range.end, + last_record_lsn ); - let end_lsn = Lsn(max_lsn.0 + 1); if let Some(check_start_lsn) = check_start_lsn { - assert!(min_lsn >= check_start_lsn); + assert!(deltas.lsn_range.start >= check_start_lsn); + } + // check if the delta layer does not violate the LSN invariant, the legacy compaction should always produce a batch of + // layers of the same start/end LSN, and so should the force inserted layer + { + /// Checks if a overlaps with b, assume a/b = [start, end). + pub fn overlaps_with(a: &Range, b: &Range) -> bool { + !(a.end <= b.start || b.end <= a.start) + } + + let guard = self.layers.read().await; + for layer in guard.layer_map().iter_historic_layers() { + if layer.is_delta() + && overlaps_with(&layer.lsn_range, &deltas.lsn_range) + && layer.lsn_range != deltas.lsn_range + { + // If a delta layer overlaps with another delta layer AND their LSN range is not the same, panic + panic!( + "inserted layer violates delta layer LSN invariant: current_lsn_range={}..{}, conflict_lsn_range={}..{}", + deltas.lsn_range.start, deltas.lsn_range.end, layer.lsn_range.start, layer.lsn_range.end + ); + } + } } let mut delta_layer_writer = DeltaLayerWriter::new( self.conf, self.timeline_id, self.tenant_shard_id, - min_key, - min_lsn..end_lsn, + deltas.key_range.start, + deltas.lsn_range, ctx, ) .await?; - for (key, lsn, val) in deltas { + for (key, lsn, val) in deltas.data { delta_layer_writer.put_value(key, lsn, val, ctx).await?; } - let delta_layer = delta_layer_writer.finish(max_key, self, ctx).await?; + let delta_layer = delta_layer_writer + .finish(deltas.key_range.end, self, ctx) + .await?; { let mut guard = self.layers.write().await; diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 6d747d424dde..b0088f4ea228 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -272,6 +272,7 @@ impl DeleteTimelineFlow { TimelineResources { remote_client, timeline_get_throttle: tenant.timeline_get_throttle.clone(), + l0_flush_global_state: tenant.l0_flush_global_state.clone(), }, // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. 
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 948237e06a5e..a43ff873acb5 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -339,6 +339,10 @@ impl LayerManager { self.layer_fmgr.contains(layer) } + pub(crate) fn contains_key(&self, key: &PersistentLayerKey) -> bool { + self.layer_fmgr.contains_key(key) + } + pub(crate) fn all_persistent_layers(&self) -> Vec { self.layer_fmgr.0.keys().cloned().collect_vec() } @@ -363,6 +367,10 @@ impl LayerFileManager { .clone() } + fn contains_key(&self, key: &PersistentLayerKey) -> bool { + self.0.contains_key(key) + } + pub(crate) fn insert(&mut self, layer: T) { let present = self.0.insert(layer.layer_desc().key(), layer.clone()); if present.is_some() && cfg!(debug_assertions) { diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index c6ee6b90c4d1..a66900522af4 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -26,7 +26,7 @@ use tracing::{debug, error, info, trace, warn, Instrument}; use super::TaskStateUpdate; use crate::{ context::RequestContext, - metrics::{LIVE_CONNECTIONS_COUNT, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, + metrics::{LIVE_CONNECTIONS, WALRECEIVER_STARTED_CONNECTIONS, WAL_INGEST}, task_mgr::TaskKind, task_mgr::WALRECEIVER_RUNTIME, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, @@ -208,14 +208,9 @@ pub(super) async fn handle_walreceiver_connection( .instrument(tracing::info_span!("poller")), ); - // Immediately increment the gauge, then create a job to decrement it on task exit. - // One of the pros of `defer!` is that this will *most probably* - // get called, even in presence of panics. - let gauge = LIVE_CONNECTIONS_COUNT.with_label_values(&["wal_receiver"]); - gauge.inc(); - scopeguard::defer! { - gauge.dec(); - } + let _guard = LIVE_CONNECTIONS + .with_label_values(&["wal_receiver"]) + .guard(); let identify = identify_system(&replication_client).await?; info!("{identify:?}"); diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 1241a1390209..7ad8446e0411 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -20,6 +20,7 @@ use std::num::NonZeroUsize; use bytes::BytesMut; use pageserver_api::key::Key; +use tokio_epoll_uring::BoundedBuf; use utils::lsn::Lsn; use utils::vec_map::VecMap; @@ -316,8 +317,9 @@ impl<'a> VectoredBlobReader<'a> { ); let buf = self .file - .read_exact_at_n(buf, read.start, read.size(), ctx) - .await?; + .read_exact_at(buf.slice(0..read.size()), read.start, ctx) + .await? 
+ .into_inner(); let blobs_at = read.blobs_at.as_slice(); let start_offset = blobs_at.first().expect("VectoredRead is never empty").0; diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 04d9386fab92..51b0c420c346 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -13,7 +13,7 @@ use crate::context::RequestContext; use crate::metrics::{StorageIoOperation, STORAGE_IO_SIZE, STORAGE_IO_TIME_METRIC}; -use crate::page_cache::PageWriteGuard; +use crate::page_cache::{PageWriteGuard, PAGE_SZ}; use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; @@ -48,6 +48,7 @@ pub(crate) mod owned_buffers_io { //! but for the time being we're proving out the primitives in the neon.git repo //! for faster iteration. + pub(crate) mod slice; pub(crate) mod write; pub(crate) mod util { pub(crate) mod size_tracking_writer; @@ -143,16 +144,17 @@ struct SlotInner { /// Impl of [`tokio_epoll_uring::IoBuf`] and [`tokio_epoll_uring::IoBufMut`] for [`PageWriteGuard`]. struct PageWriteGuardBuf { page: PageWriteGuard<'static>, - init_up_to: usize, } // Safety: the [`PageWriteGuard`] gives us exclusive ownership of the page cache slot, // and the location remains stable even if [`Self`] or the [`PageWriteGuard`] is moved. +// Page cache pages are zero-initialized, so, wrt uninitialized memory we're good. +// (Page cache tracks separately whether the contents are valid, see `PageWriteGuard::mark_valid`.) unsafe impl tokio_epoll_uring::IoBuf for PageWriteGuardBuf { fn stable_ptr(&self) -> *const u8 { self.page.as_ptr() } fn bytes_init(&self) -> usize { - self.init_up_to + self.page.len() } fn bytes_total(&self) -> usize { self.page.len() @@ -166,8 +168,8 @@ unsafe impl tokio_epoll_uring::IoBufMut for PageWriteGuardBuf { } unsafe fn set_init(&mut self, pos: usize) { + // There shouldn't really be any reason to call this API since bytes_init() == bytes_total(). assert!(pos <= self.page.len()); - self.init_up_to = pos; } } @@ -585,37 +587,37 @@ impl VirtualFile { Ok(self.pos) } - pub async fn read_exact_at( + /// Read the file contents in range `offset..(offset + slice.bytes_total())` into `slice[0..slice.bytes_total()]`. + /// + /// The returned `Slice` is equivalent to the input `slice`, i.e., it's the same view into the same buffer. 
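+    ///
+    /// The slice does not need to be initialized: `bytes_total()` determines how many bytes are
+    /// read, and on success the returned slice has that many bytes initialized.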
+ pub async fn read_exact_at( &self, - buf: B, + slice: Slice, offset: u64, ctx: &RequestContext, - ) -> Result + ) -> Result, Error> where - B: IoBufMut + Send, + Buf: IoBufMut + Send, { - let (buf, res) = read_exact_at_impl(buf, offset, None, |buf, offset| { - self.read_at(buf, offset, ctx) - }) - .await; - res.map(|()| buf) - } + let assert_we_return_original_bounds = if cfg!(debug_assertions) { + Some((slice.stable_ptr() as usize, slice.bytes_total())) + } else { + None + }; - pub async fn read_exact_at_n( - &self, - buf: B, - offset: u64, - count: usize, - ctx: &RequestContext, - ) -> Result - where - B: IoBufMut + Send, - { - let (buf, res) = read_exact_at_impl(buf, offset, Some(count), |buf, offset| { - self.read_at(buf, offset, ctx) - }) - .await; - res.map(|()| buf) + let original_bounds = slice.bounds(); + let (buf, res) = + read_exact_at_impl(slice, offset, |buf, offset| self.read_at(buf, offset, ctx)).await; + let res = res.map(|_| buf.slice(original_bounds)); + + if let Some(original_bounds) = assert_we_return_original_bounds { + if let Ok(slice) = &res { + let returned_bounds = (slice.stable_ptr() as usize, slice.bytes_total()); + assert_eq!(original_bounds, returned_bounds); + } + } + + res } /// Like [`Self::read_exact_at`] but for [`PageWriteGuard`]. @@ -625,13 +627,11 @@ impl VirtualFile { offset: u64, ctx: &RequestContext, ) -> Result, Error> { - let buf = PageWriteGuardBuf { - page, - init_up_to: 0, - }; - let res = self.read_exact_at(buf, offset, ctx).await; - res.map(|PageWriteGuardBuf { page, .. }| page) - .map_err(|e| Error::new(ErrorKind::Other, e)) + let buf = PageWriteGuardBuf { page }.slice_full(); + debug_assert_eq!(buf.bytes_total(), PAGE_SZ); + self.read_exact_at(buf, offset, ctx) + .await + .map(|slice| slice.into_inner().page) } // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235 @@ -722,14 +722,14 @@ impl VirtualFile { (buf, Ok(n)) } - pub(crate) async fn read_at( + pub(crate) async fn read_at( &self, - buf: B, + buf: tokio_epoll_uring::Slice, offset: u64, _ctx: &RequestContext, /* TODO: use for metrics: https://github.com/neondatabase/neon/issues/6107 */ - ) -> (B, Result) + ) -> (tokio_epoll_uring::Slice, Result) where - B: tokio_epoll_uring::BoundedBufMut + Send, + Buf: tokio_epoll_uring::IoBufMut + Send, { let file_guard = match self.lock_file().await { Ok(file_guard) => file_guard, @@ -781,26 +781,16 @@ impl VirtualFile { } // Adapted from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#117-135 -pub async fn read_exact_at_impl( - buf: B, +pub async fn read_exact_at_impl( + mut buf: tokio_epoll_uring::Slice, mut offset: u64, - count: Option, mut read_at: F, -) -> (B, std::io::Result<()>) +) -> (Buf, std::io::Result<()>) where - B: IoBufMut + Send, - F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut, - Fut: std::future::Future, std::io::Result)>, + Buf: IoBufMut + Send, + F: FnMut(tokio_epoll_uring::Slice, u64) -> Fut, + Fut: std::future::Future, std::io::Result)>, { - let mut buf: tokio_epoll_uring::Slice = match count { - Some(count) => { - assert!(count <= buf.bytes_total()); - assert!(count > 0); - buf.slice(..count) // may include uninitialized memory - } - None => buf.slice_full(), // includes all the uninitialized memory - }; - while buf.bytes_total() != 0 { let res; (buf, res) = read_at(buf, offset).await; @@ -882,7 +872,7 @@ mod test_read_exact_at_impl { #[tokio::test] async fn test_basic() { - let buf = Vec::with_capacity(5); + let buf = Vec::with_capacity(5).slice_full(); let mock_read_at = 
Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::from(vec![Expectation { offset: 0, @@ -890,7 +880,7 @@ mod test_read_exact_at_impl { result: Ok(vec![b'a', b'b', b'c', b'd', b'e']), }]), })); - let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -899,33 +889,13 @@ mod test_read_exact_at_impl { assert_eq!(buf, vec![b'a', b'b', b'c', b'd', b'e']); } - #[tokio::test] - async fn test_with_count() { - let buf = Vec::with_capacity(5); - let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { - expectations: VecDeque::from(vec![Expectation { - offset: 0, - bytes_total: 3, - result: Ok(vec![b'a', b'b', b'c']), - }]), - })); - - let (buf, res) = read_exact_at_impl(buf, 0, Some(3), |buf, offset| { - let mock_read_at = Arc::clone(&mock_read_at); - async move { mock_read_at.lock().await.read_at(buf, offset).await } - }) - .await; - assert!(res.is_ok()); - assert_eq!(buf, vec![b'a', b'b', b'c']); - } - #[tokio::test] async fn test_empty_buf_issues_no_syscall() { - let buf = Vec::new(); + let buf = Vec::new().slice_full(); let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::new(), })); - let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -935,7 +905,7 @@ mod test_read_exact_at_impl { #[tokio::test] async fn test_two_read_at_calls_needed_until_buf_filled() { - let buf = Vec::with_capacity(4); + let buf = Vec::with_capacity(4).slice_full(); let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::from(vec![ Expectation { @@ -950,7 +920,7 @@ mod test_read_exact_at_impl { }, ]), })); - let (buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let (buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -961,7 +931,7 @@ mod test_read_exact_at_impl { #[tokio::test] async fn test_eof_before_buffer_full() { - let buf = Vec::with_capacity(3); + let buf = Vec::with_capacity(3).slice_full(); let mock_read_at = Arc::new(tokio::sync::Mutex::new(MockReadAt { expectations: VecDeque::from(vec![ Expectation { @@ -981,7 +951,7 @@ mod test_read_exact_at_impl { }, ]), })); - let (_buf, res) = read_exact_at_impl(buf, 0, None, |buf, offset| { + let (_buf, res) = read_exact_at_impl(buf, 0, |buf, offset| { let mock_read_at = Arc::clone(&mock_read_at); async move { mock_read_at.lock().await.read_at(buf, offset).await } }) @@ -1051,27 +1021,29 @@ impl VirtualFile { ctx: &RequestContext, ) -> Result, std::io::Error> { use crate::page_cache::PAGE_SZ; - let buf = vec![0; PAGE_SZ]; - let buf = self - .read_exact_at(buf, blknum as u64 * (PAGE_SZ as u64), ctx) + let slice = Vec::with_capacity(PAGE_SZ).slice_full(); + assert_eq!(slice.bytes_total(), PAGE_SZ); + let slice = self + .read_exact_at(slice, blknum as u64 * (PAGE_SZ as u64), ctx) .await?; - Ok(crate::tenant::block_io::BlockLease::Vec(buf)) + Ok(crate::tenant::block_io::BlockLease::Vec(slice.into_inner())) } async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { let mut tmp = vec![0; 128]; loop { - let res; - (tmp, res) = 
self.read_at(tmp, self.pos, ctx).await; + let slice = tmp.slice(..128); + let (slice, res) = self.read_at(slice, self.pos, ctx).await; match res { Ok(0) => return Ok(()), Ok(n) => { self.pos += n as u64; - buf.extend_from_slice(&tmp[..n]); + buf.extend_from_slice(&slice[..n]); } Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {} Err(e) => return Err(e), } + tmp = slice.into_inner(); } } } @@ -1185,6 +1157,7 @@ mod tests { use crate::task_mgr::TaskKind; use super::*; + use owned_buffers_io::slice::SliceExt; use rand::seq::SliceRandom; use rand::thread_rng; use rand::Rng; @@ -1206,13 +1179,16 @@ mod tests { impl MaybeVirtualFile { async fn read_exact_at( &self, - mut buf: Vec, + mut slice: tokio_epoll_uring::Slice>, offset: u64, ctx: &RequestContext, - ) -> Result, Error> { + ) -> Result>, Error> { match self { - MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(buf, offset, ctx).await, - MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf), + MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(slice, offset, ctx).await, + MaybeVirtualFile::File(file) => { + let rust_slice: &mut [u8] = slice.as_mut_rust_slice_full_zeroed(); + file.read_exact_at(rust_slice, offset).map(|()| slice) + } } } async fn write_all_at, Buf: IoBuf + Send>( @@ -1286,9 +1262,12 @@ mod tests { len: usize, ctx: &RequestContext, ) -> Result { - let buf = vec![0; len]; - let buf = self.read_exact_at(buf, pos, ctx).await?; - Ok(String::from_utf8(buf).unwrap()) + let slice = Vec::with_capacity(len).slice_full(); + assert_eq!(slice.bytes_total(), len); + let slice = self.read_exact_at(slice, pos, ctx).await?; + let vec = slice.into_inner(); + assert_eq!(vec.len(), len); + Ok(String::from_utf8(vec).unwrap()) } } @@ -1507,7 +1486,11 @@ mod tests { let mut rng = rand::rngs::OsRng; for _ in 1..1000 { let f = &files[rng.gen_range(0..files.len())]; - buf = f.read_exact_at(buf, 0, &ctx).await.unwrap(); + buf = f + .read_exact_at(buf.slice_full(), 0, &ctx) + .await + .unwrap() + .into_inner(); assert!(buf == SAMPLE); } }); diff --git a/pageserver/src/virtual_file/io_engine.rs b/pageserver/src/virtual_file/io_engine.rs index 7a27be2ca108..2820cea097d1 100644 --- a/pageserver/src/virtual_file/io_engine.rs +++ b/pageserver/src/virtual_file/io_engine.rs @@ -107,7 +107,7 @@ use std::{ sync::atomic::{AtomicU8, Ordering}, }; -use super::{FileGuard, Metadata}; +use super::{owned_buffers_io::slice::SliceExt, FileGuard, Metadata}; #[cfg(target_os = "linux")] fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std::io::Error { @@ -120,38 +120,29 @@ fn epoll_uring_error_to_std(e: tokio_epoll_uring::Error) -> std: } impl IoEngine { - pub(super) async fn read_at( + pub(super) async fn read_at( &self, file_guard: FileGuard, offset: u64, - mut buf: B, - ) -> ((FileGuard, B), std::io::Result) + mut slice: tokio_epoll_uring::Slice, + ) -> ( + (FileGuard, tokio_epoll_uring::Slice), + std::io::Result, + ) where - B: tokio_epoll_uring::BoundedBufMut + Send, + Buf: tokio_epoll_uring::IoBufMut + Send, { match self { IoEngine::NotSet => panic!("not initialized"), IoEngine::StdFs => { - // SAFETY: `dst` only lives at most as long as this match arm, during which buf remains valid memory. 
- let dst = unsafe { - std::slice::from_raw_parts_mut(buf.stable_mut_ptr(), buf.bytes_total()) - }; - let res = file_guard.with_std_file(|std_file| std_file.read_at(dst, offset)); - if let Ok(nbytes) = &res { - assert!(*nbytes <= buf.bytes_total()); - // SAFETY: see above assertion - unsafe { - buf.set_init(*nbytes); - } - } - #[allow(dropping_references)] - drop(dst); - ((file_guard, buf), res) + let rust_slice = slice.as_mut_rust_slice_full_zeroed(); + let res = file_guard.with_std_file(|std_file| std_file.read_at(rust_slice, offset)); + ((file_guard, slice), res) } #[cfg(target_os = "linux")] IoEngine::TokioEpollUring => { let system = tokio_epoll_uring_ext::thread_local_system().await; - let (resources, res) = system.read(file_guard, offset, buf).await; + let (resources, res) = system.read(file_guard, offset, slice).await; (resources, res.map_err(epoll_uring_error_to_std)) } } diff --git a/pageserver/src/virtual_file/owned_buffers_io/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/slice.rs new file mode 100644 index 000000000000..d19e5ddffefb --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/slice.rs @@ -0,0 +1,121 @@ +use tokio_epoll_uring::BoundedBuf; +use tokio_epoll_uring::BoundedBufMut; +use tokio_epoll_uring::IoBufMut; +use tokio_epoll_uring::Slice; + +pub(crate) trait SliceExt { + /// Get a `&mut[0..self.bytes_total()`] slice, for when you need to do borrow-based IO. + /// + /// See the test case `test_slice_full_zeroed` for the difference to just doing `&slice[..]` + fn as_mut_rust_slice_full_zeroed(&mut self) -> &mut [u8]; +} + +impl SliceExt for Slice +where + B: IoBufMut, +{ + #[inline(always)] + fn as_mut_rust_slice_full_zeroed(&mut self) -> &mut [u8] { + // zero-initialize the uninitialized parts of the buffer so we can create a Rust slice + // + // SAFETY: we own `slice`, don't write outside the bounds + unsafe { + let to_init = self.bytes_total() - self.bytes_init(); + self.stable_mut_ptr() + .add(self.bytes_init()) + .write_bytes(0, to_init); + self.set_init(self.bytes_total()); + }; + let bytes_total = self.bytes_total(); + &mut self[0..bytes_total] + } +} + +#[cfg(test)] +mod tests { + use std::io::Read; + + use super::*; + use bytes::Buf; + use tokio_epoll_uring::Slice; + + #[test] + fn test_slice_full_zeroed() { + let make_fake_file = || bytes::BytesMut::from(&b"12345"[..]).reader(); + + // before we start the test, let's make sure we have a shared understanding of what slice_full does + { + let buf = Vec::with_capacity(3); + let slice: Slice<_> = buf.slice_full(); + assert_eq!(slice.bytes_init(), 0); + assert_eq!(slice.bytes_total(), 3); + let rust_slice = &slice[..]; + assert_eq!( + rust_slice.len(), + 0, + "Slice only derefs to a &[u8] of the initialized part" + ); + } + + // and also let's establish a shared understanding of .slice() + { + let buf = Vec::with_capacity(3); + let slice: Slice<_> = buf.slice(0..2); + assert_eq!(slice.bytes_init(), 0); + assert_eq!(slice.bytes_total(), 2); + let rust_slice = &slice[..]; + assert_eq!( + rust_slice.len(), + 0, + "Slice only derefs to a &[u8] of the initialized part" + ); + } + + // the above leads to the easy mistake of using slice[..] for borrow-based IO like so: + { + let buf = Vec::with_capacity(3); + let mut slice: Slice<_> = buf.slice_full(); + assert_eq!(slice[..].len(), 0); + let mut file = make_fake_file(); + file.read_exact(&mut slice[..]).unwrap(); // one might think this reads 3 bytes but it reads 0 + assert_eq!(&slice[..] as &[u8], &[][..] 
as &[u8]); + } + + // With owned buffers IO like with VirtualFilem, you could totally + // pass in a `Slice` with bytes_init()=0 but bytes_total()=5 + // and it will read 5 bytes into the slice, and return a slice that has bytes_init()=5. + { + // TODO: demo + } + + // + // Ok, now that we have a shared understanding let's demo how to use the extension trait. + // + + // slice_full() + { + let buf = Vec::with_capacity(3); + let mut slice: Slice<_> = buf.slice_full(); + let rust_slice = slice.as_mut_rust_slice_full_zeroed(); + assert_eq!(rust_slice.len(), 3); + assert_eq!(rust_slice, &[0, 0, 0]); + let mut file = make_fake_file(); + file.read_exact(rust_slice).unwrap(); + assert_eq!(rust_slice, b"123"); + assert_eq!(&slice[..], b"123"); + } + + // .slice(..) + { + let buf = Vec::with_capacity(3); + let mut slice: Slice<_> = buf.slice(0..2); + let rust_slice = slice.as_mut_rust_slice_full_zeroed(); + assert_eq!(rust_slice.len(), 2); + assert_eq!(rust_slice, &[0, 0]); + let mut file = make_fake_file(); + file.read_exact(rust_slice).unwrap(); + assert_eq!(rust_slice, b"12"); + assert_eq!(&slice[..], b"12"); + } + } +} diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 4f26f2f6d1f5..07c90385e654 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -343,7 +343,33 @@ impl WalIngest { xlog_checkpoint.oldestActiveXid, self.checkpoint.oldestActiveXid ); - self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid; + + // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`, + // because at shutdown, all in-progress transactions will implicitly + // end. Postgres startup code knows that, and allows hot standby to start + // immediately from a shutdown checkpoint. + // + // In Neon, Postgres hot standby startup always behaves as if starting from + // an online checkpoint. It needs a valid `oldestActiveXid` value, so + // instead of overwriting self.checkpoint.oldestActiveXid with + // InvalidTransactionid from the checkpoint WAL record, update it to a + // proper value, knowing that there are no in-progress transactions at this + // point, except for prepared transactions. + // + // See also the neon code changes in the InitWalRecovery() function. + if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID + && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN + { + let mut oldest_active_xid = self.checkpoint.nextXid.value as u32; + for xid in modification.tline.list_twophase_files(lsn, ctx).await? { + if (xid.wrapping_sub(oldest_active_xid) as i32) < 0 { + oldest_active_xid = xid; + } + } + self.checkpoint.oldestActiveXid = oldest_active_xid; + } else { + self.checkpoint.oldestActiveXid = xlog_checkpoint.oldestActiveXid; + } // Write a new checkpoint key-value pair on every checkpoint record, even // if nothing really changed. Not strictly required, but it seems nice to @@ -375,6 +401,7 @@ impl WalIngest { if info == pg_constants::XLOG_RUNNING_XACTS { let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf); self.checkpoint.oldestActiveXid = xlrec.oldest_running_xid; + self.checkpoint_modified = true; } } pg_constants::RM_REPLORIGIN_ID => { @@ -1277,13 +1304,10 @@ impl WalIngest { xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db ); - // Here we treat oldestXid and oldestXidDB - // differently from postgres redo routines. - // In postgres checkpoint.oldestXid lags behind xlrec.oldest_xid - // until checkpoint happens and updates the value. - // Here we can use the most recent value. 
- // It's just an optimization, though and can be deleted. - // TODO Figure out if there will be any issues with replica. + // In Postgres, oldestXid and oldestXidDB are updated in memory when the CLOG is + // truncated, but a checkpoint record with the updated values isn't written until + // later. In Neon, a server can start at any LSN, not just on a checkpoint record, + // so we keep the oldestXid and oldestXidDB up-to-date. self.checkpoint.oldestXid = xlrec.oldest_xid; self.checkpoint.oldestXidDB = xlrec.oldest_xid_db; self.checkpoint_modified = true; @@ -1384,14 +1408,31 @@ impl WalIngest { // Note: The multixact members can wrap around, even within one WAL record. offset = offset.wrapping_add(n_this_page as u32); } - if xlrec.mid >= self.checkpoint.nextMulti { - self.checkpoint.nextMulti = xlrec.mid + 1; - self.checkpoint_modified = true; - } - if xlrec.moff + xlrec.nmembers > self.checkpoint.nextMultiOffset { - self.checkpoint.nextMultiOffset = xlrec.moff + xlrec.nmembers; + let next_offset = offset; + assert!(xlrec.moff.wrapping_add(xlrec.nmembers) == next_offset); + + // Update next-multi-xid and next-offset + // + // NB: In PostgreSQL, the next-multi-xid stored in the control file is allowed to + // go to 0, and it's fixed up by skipping to FirstMultiXactId in functions that + // read it, like GetNewMultiXactId(). This is different from how nextXid is + // incremented! nextXid skips over < FirstNormalTransactionId when the the value + // is stored, so it's never 0 in a checkpoint. + // + // I don't know why it's done that way, it seems less error-prone to skip over 0 + // when the value is stored rather than when it's read. But let's do it the same + // way here. + let next_multi_xid = xlrec.mid.wrapping_add(1); + + if self + .checkpoint + .update_next_multixid(next_multi_xid, next_offset) + { self.checkpoint_modified = true; } + + // Also update the next-xid with the highest member. According to the comments in + // multixact_redo(), this shouldn't be necessary, but let's do the same here. let max_mbr_xid = xlrec.members.iter().fold(None, |acc, mbr| { if let Some(max_xid) = acc { if mbr.xid.wrapping_sub(max_xid) as i32 > 0 { diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index d562540bde9b..5095beefd755 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -40,6 +40,7 @@ use std::time::Duration; use std::time::Instant; use tracing::*; use utils::lsn::Lsn; +use utils::sync::gate::GateError; use utils::sync::heavier_once_cell; /// @@ -53,10 +54,18 @@ pub struct PostgresRedoManager { tenant_shard_id: TenantShardId, conf: &'static PageServerConf, last_redo_at: std::sync::Mutex>, - /// The current [`process::WalRedoProcess`] that is used by new redo requests. - /// We use [`heavier_once_cell`] for coalescing the spawning, but the redo - /// requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the + /// We use [`heavier_once_cell`] for + /// + /// 1. coalescing the lazy spawning of walredo processes ([`ProcessOnceCell::Spawned`]) + /// 2. prevent new processes from being spawned on [`Self::shutdown`] (=> [`ProcessOnceCell::ManagerShutDown`]). + /// + /// # Spawning + /// + /// Redo requests use the once cell to coalesce onto one call to [`process::WalRedoProcess::launch`]. + /// + /// Notably, requests don't use the [`heavier_once_cell::Guard`] to keep ahold of the /// their process object; we use [`Arc::clone`] for that. 
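Aside: expressed with plain `std` types, the spawn-coalescing plus `Arc`-sharing that this comment describes looks roughly like the following. This is a simplified stand-in only; the real code uses the crate-internal `heavier_once_cell` together with a `Gate`, which additionally lets `shutdown()` mark the slot as permanently closed and wait for every launched process to be reaped.

```rust
use std::sync::{Arc, Mutex};

/// Simplified stand-in for the redo-process slot: concurrent requests race to
/// fill it, exactly one spawns, everyone else clones the Arc.
struct ProcessSlot<T> {
    slot: Mutex<Option<Arc<T>>>,
}

impl<T> ProcessSlot<T> {
    fn get_or_spawn(&self, spawn: impl FnOnce() -> T) -> Arc<T> {
        let mut guard = self.slot.lock().unwrap();
        match &*guard {
            Some(existing) => Arc::clone(existing),
            None => {
                let proc = Arc::new(spawn());
                *guard = Some(Arc::clone(&proc));
                proc
            }
        }
    }

    /// The first request to observe an error takes the process out of rotation;
    /// other requests may still hold (and use) their own Arc clones.
    fn take_if_same(&self, failed: &Arc<T>) {
        let mut guard = self.slot.lock().unwrap();
        let is_same = matches!(guard.as_ref(), Some(current) if Arc::ptr_eq(current, failed));
        if is_same {
            *guard = None;
        }
    }
}
```

In the real manager the slot value is an enum so that a `ManagerShutDown` marker can be left behind, and each spawned process also holds a gate guard so shutdown can wait for all of them to be killed and waited upon.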
+ /// /// This is primarily because earlier implementations that didn't use [`heavier_once_cell`] /// had that behavior; it's probably unnecessary. /// The only merit of it is that if one walredo process encounters an error, @@ -65,7 +74,63 @@ pub struct PostgresRedoManager { /// still be using the old redo process. But, those other tasks will most likely /// encounter an error as well, and errors are an unexpected condition anyway. /// So, probably we could get rid of the `Arc` in the future. - redo_process: heavier_once_cell::OnceCell>, + /// + /// # Shutdown + /// + /// See [`Self::launched_processes`]. + redo_process: heavier_once_cell::OnceCell, + + /// Gate that is entered when launching a walredo process and held open + /// until the process has been `kill()`ed and `wait()`ed upon. + /// + /// Manager shutdown waits for this gate to close after setting the + /// [`ProcessOnceCell::ManagerShutDown`] state in [`Self::redo_process`]. + /// + /// This type of usage is a bit unusual because gates usually keep track of + /// concurrent operations, e.g., every [`Self::request_redo`] that is inflight. + /// But we use it here to keep track of the _processes_ that we have launched, + /// which may outlive any individual redo request because + /// - we keep walredo process around until its quiesced to amortize spawn cost and + /// - the Arc may be held by multiple concurrent redo requests, so, just because + /// you replace the [`Self::redo_process`] cell's content doesn't mean the + /// process gets killed immediately. + /// + /// We could simplify this by getting rid of the [`Arc`]. + /// See the comment on [`Self::redo_process`] for more details. + launched_processes: utils::sync::gate::Gate, +} + +/// See [`PostgresRedoManager::redo_process`]. +enum ProcessOnceCell { + Spawned(Arc), + ManagerShutDown, +} + +struct Process { + _launched_processes_guard: utils::sync::gate::GateGuard, + process: process::WalRedoProcess, +} + +impl std::ops::Deref for Process { + type Target = process::WalRedoProcess; + + fn deref(&self) -> &Self::Target { + &self.process + } +} + +#[derive(Debug, thiserror::Error)] +pub enum Error { + #[error("cancelled")] + Cancelled, + #[error(transparent)] + Other(#[from] anyhow::Error), +} + +macro_rules! bail { + ($($arg:tt)*) => { + return Err($crate::walredo::Error::Other(::anyhow::anyhow!($($arg)*))); + } } /// @@ -88,9 +153,9 @@ impl PostgresRedoManager { base_img: Option<(Lsn, Bytes)>, records: Vec<(Lsn, NeonWalRecord)>, pg_version: u32, - ) -> anyhow::Result { + ) -> Result { if records.is_empty() { - anyhow::bail!("invalid WAL redo request with no records"); + bail!("invalid WAL redo request with no records"); } let base_img_lsn = base_img.as_ref().map(|p| p.0).unwrap_or(Lsn::INVALID); @@ -148,10 +213,10 @@ impl PostgresRedoManager { chrono::Utc::now().checked_sub_signed(chrono::Duration::from_std(age).ok()?) }) }, - process: self - .redo_process - .get() - .map(|p| WalRedoManagerProcessStatus { pid: p.id() }), + process: self.redo_process.get().and_then(|p| match &*p { + ProcessOnceCell::Spawned(p) => Some(WalRedoManagerProcessStatus { pid: p.id() }), + ProcessOnceCell::ManagerShutDown => None, + }), } } } @@ -170,9 +235,39 @@ impl PostgresRedoManager { conf, last_redo_at: std::sync::Mutex::default(), redo_process: heavier_once_cell::OnceCell::default(), + launched_processes: utils::sync::gate::Gate::default(), } } + /// Shut down the WAL redo manager. 
+ /// + /// After this future completes + /// - no redo process is running + /// - no new redo process will be spawned + /// - redo requests that need walredo process will fail with [`Error::Cancelled`] + /// - [`apply_neon`]-only redo requests may still work, but this may change in the future + /// + /// # Cancel-Safety + /// + /// This method is cancellation-safe. + pub async fn shutdown(&self) { + // prevent new processes from being spawned + let permit = match self.redo_process.get_or_init_detached().await { + Ok(guard) => { + let (proc, permit) = guard.take_and_deinit(); + drop(proc); // this just drops the Arc, its refcount may not be zero yet + permit + } + Err(permit) => permit, + }; + self.redo_process + .set(ProcessOnceCell::ManagerShutDown, permit); + // wait for ongoing requests to drain and the refcounts of all Arc that + // we ever launched to drop to zero, which when it happens synchronously kill()s & wait()s + // for the underlying process. + self.launched_processes.close().await; + } + /// This type doesn't have its own background task to check for idleness: we /// rely on our owner calling this function periodically in its own housekeeping /// loops. @@ -203,38 +298,48 @@ impl PostgresRedoManager { records: &[(Lsn, NeonWalRecord)], wal_redo_timeout: Duration, pg_version: u32, - ) -> anyhow::Result { + ) -> Result { *(self.last_redo_at.lock().unwrap()) = Some(Instant::now()); let (rel, blknum) = key.to_rel_block().context("invalid record")?; const MAX_RETRY_ATTEMPTS: u32 = 1; let mut n_attempts = 0u32; loop { - let proc: Arc = - match self.redo_process.get_or_init_detached().await { - Ok(guard) => Arc::clone(&guard), - Err(permit) => { - // don't hold poison_guard, the launch code can bail - let start = Instant::now(); - let proc = Arc::new( - process::WalRedoProcess::launch( + let proc: Arc = match self.redo_process.get_or_init_detached().await { + Ok(guard) => match &*guard { + ProcessOnceCell::Spawned(proc) => Arc::clone(proc), + ProcessOnceCell::ManagerShutDown => { + return Err(Error::Cancelled); + } + }, + Err(permit) => { + let start = Instant::now(); + let proc = Arc::new(Process { + _launched_processes_guard: match self.launched_processes.enter() { + Ok(guard) => guard, + Err(GateError::GateClosed) => unreachable!( + "shutdown sets the once cell to `ManagerShutDown` state before closing the gate" + ), + }, + process: process::WalRedoProcess::launch( self.conf, self.tenant_shard_id, pg_version, ) .context("launch walredo process")?, - ); - let duration = start.elapsed(); - WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); - info!( - duration_ms = duration.as_millis(), - pid = proc.id(), - "launched walredo process" - ); - self.redo_process.set(Arc::clone(&proc), permit); - proc - } - }; + }); + let duration = start.elapsed(); + WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.observe(duration.as_secs_f64()); + info!( + duration_ms = duration.as_millis(), + pid = proc.id(), + "launched walredo process" + ); + self.redo_process + .set(ProcessOnceCell::Spawned(Arc::clone(&proc)), permit); + proc + } + }; let started_at = std::time::Instant::now(); @@ -299,12 +404,17 @@ impl PostgresRedoManager { match self.redo_process.get() { None => (), Some(guard) => { - if Arc::ptr_eq(&proc, &*guard) { - // We're the first to observe an error from `proc`, it's our job to take it out of rotation. - guard.take_and_deinit(); - } else { - // Another task already spawned another redo process (further up in this method) - // and put it into `redo_process`. 
Do nothing, our view of the world is behind. + match &*guard { + ProcessOnceCell::ManagerShutDown => {} + ProcessOnceCell::Spawned(guard_proc) => { + if Arc::ptr_eq(&proc, guard_proc) { + // We're the first to observe an error from `proc`, it's our job to take it out of rotation. + guard.take_and_deinit(); + } else { + // Another task already spawned another redo process (further up in this method) + // and put it into `redo_process`. Do nothing, our view of the world is behind. + } + } } } } @@ -315,7 +425,7 @@ impl PostgresRedoManager { } n_attempts += 1; if n_attempts > MAX_RETRY_ATTEMPTS || result.is_ok() { - return result; + return result.map_err(Error::Other); } } } @@ -329,7 +439,7 @@ impl PostgresRedoManager { lsn: Lsn, base_img: Option, records: &[(Lsn, NeonWalRecord)], - ) -> anyhow::Result { + ) -> Result { let start_time = Instant::now(); let mut page = BytesMut::new(); @@ -338,7 +448,7 @@ impl PostgresRedoManager { page.extend_from_slice(&fpi[..]); } else { // All the current WAL record types that we can handle require a base image. - anyhow::bail!("invalid neon WAL redo request with no base image"); + bail!("invalid neon WAL redo request with no base image"); } // Apply all the WAL records in the batch diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index cd316dbb9141..3b755bb0420c 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -6,6 +6,7 @@ OBJS = \ $(WIN32RES) \ extension_server.o \ file_cache.o \ + hll.o \ libpagestore.o \ neon.o \ neon_utils.o \ @@ -22,7 +23,7 @@ SHLIB_LINK_INTERNAL = $(libpq) SHLIB_LINK = -lcurl EXTENSION = neon -DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql +DATA = neon--1.0.sql neon--1.0--1.1.sql neon--1.1--1.2.sql neon--1.2--1.3.sql neon--1.3--1.2.sql neon--1.2--1.1.sql neon--1.1--1.0.sql neon--1.3--1.4.sql neon--1.4--1.3.sql PGFILEDESC = "neon - cloud storage for PostgreSQL" EXTRA_CLEAN = \ diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 25275ef31fe9..1894e8c72a5c 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -26,7 +26,6 @@ #include "miscadmin.h" #include "pagestore_client.h" #include "common/hashfn.h" -#include "lib/hyperloglog.h" #include "pgstat.h" #include "postmaster/bgworker.h" #include RELFILEINFO_HDR @@ -40,6 +39,8 @@ #include "utils/dynahash.h" #include "utils/guc.h" +#include "hll.h" + /* * Local file cache is used to temporary store relations pages in local file system. * All blocks of all relations are stored inside one file and addressed using shared hash map. 
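Aside: the `hll.h`/`hll.c` pair introduced further down replaces the stock `lib/hyperloglog.h` estimator with a sliding-window variant: instead of keeping one maximum rank per register, it stores a last-seen timestamp per (register, rank) pair, so the working-set cardinality can be estimated over an arbitrary trailing time window. A compact sketch of the idea, in Rust rather than C and with simplified integer timestamps (illustrative constants; bias corrections omitted):

```rust
const B: usize = 10;          // register-index bits (HLL_BIT_WIDTH)
const C: usize = 32 - B;      // rank bits (HLL_C_BITS)
const N_REGS: usize = 1 << B; // number of registers

/// regs[i][r] = last time rank `r` was observed for register `i`; 0 = never.
/// Ranks run 1..=C+1, hence C+2 slots per register in this sketch.
struct SlidingHll {
    regs: Vec<[u64; C + 2]>,
}

impl SlidingHll {
    fn new() -> Self {
        SlidingHll { regs: vec![[0u64; C + 2]; N_REGS] }
    }

    fn add(&mut self, hash: u32, now: u64) {
        let index = (hash >> C) as usize; // first B bits select the register
        // rank = position of the first set bit among the remaining C bits,
        // or C + 1 if they are all zero (same role as rho() in hll.c)
        let rank = ((hash << B).leading_zeros() as usize + 1).min(C + 1);
        self.regs[index][rank] = now;
    }

    /// Standard HyperLogLog estimate, but each register contributes the highest
    /// rank seen at or after `since` (must be >= 1, since 0 means "never").
    /// The small/large-range corrections of the real code are omitted here.
    fn estimate(&self, since: u64) -> f64 {
        let m = N_REGS as f64;
        let alpha_mm = (0.7213 / (1.0 + 1.079 / m)) * m * m;
        let sum: f64 = self
            .regs
            .iter()
            .map(|reg| {
                let r = (1..=C + 1).rev().find(|&r| reg[r] >= since).unwrap_or(0);
                2f64.powi(-(r as i32))
            })
            .sum();
        alpha_mm / sum
    }
}
```

With these parameters the real structure costs 2^B * (C + 1) * 8 bytes, about 184 KiB of shared memory, as the comment in `hll.h` notes.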
@@ -62,7 +63,6 @@ #define BLOCKS_PER_CHUNK 128 /* 1Mb chunk */ #define MB ((uint64)1024*1024) -#define HYPER_LOG_LOG_BIT_WIDTH 10 #define SIZE_MB_TO_CHUNKS(size) ((uint32)((size) * MB / BLCKSZ / BLOCKS_PER_CHUNK)) typedef struct FileCacheEntry @@ -87,8 +87,7 @@ typedef struct FileCacheControl uint64 writes; dlist_head lru; /* double linked list for LRU replacement * algorithm */ - hyperLogLogState wss_estimation; /* estimation of wroking set size */ - uint8_t hyperloglog_hashes[(1 << HYPER_LOG_LOG_BIT_WIDTH) + 1]; + HyperLogLogState wss_estimation; /* estimation of working set size */ } FileCacheControl; static HTAB *lfc_hash; @@ -238,12 +237,7 @@ lfc_shmem_startup(void) dlist_init(&lfc_ctl->lru); /* Initialize hyper-log-log structure for estimating working set size */ - initHyperLogLog(&lfc_ctl->wss_estimation, HYPER_LOG_LOG_BIT_WIDTH); - - /* We need hashes in shared memory */ - pfree(lfc_ctl->wss_estimation.hashesArr); - memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes); - lfc_ctl->wss_estimation.hashesArr = lfc_ctl->hyperloglog_hashes; + initSHLL(&lfc_ctl->wss_estimation); /* Recreate file cache on restart */ fd = BasicOpenFile(lfc_path, O_RDWR | O_CREAT | O_TRUNC); @@ -545,7 +539,7 @@ lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, /* Approximate working set */ tag.blockNum = blkno; - addHyperLogLog(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); + addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag))); if (entry == NULL || (entry->bitmap[chunk_offs >> 5] & (1 << (chunk_offs & 31))) == 0) { @@ -986,20 +980,38 @@ local_cache_pages(PG_FUNCTION_ARGS) SRF_RETURN_DONE(funcctx); } +PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds); + +Datum +approximate_working_set_size_seconds(PG_FUNCTION_ARGS) +{ + if (lfc_size_limit != 0) + { + int32 dc; + time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0); + LWLockAcquire(lfc_lock, LW_SHARED); + dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration); + LWLockRelease(lfc_lock); + PG_RETURN_INT32(dc); + } + PG_RETURN_NULL(); +} + PG_FUNCTION_INFO_V1(approximate_working_set_size); Datum approximate_working_set_size(PG_FUNCTION_ARGS) { - int32 dc = -1; if (lfc_size_limit != 0) { + int32 dc; bool reset = PG_GETARG_BOOL(0); LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED); - dc = (int32) estimateHyperLogLog(&lfc_ctl->wss_estimation); + dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1); if (reset) - memset(lfc_ctl->hyperloglog_hashes, 0, sizeof lfc_ctl->hyperloglog_hashes); + memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs); LWLockRelease(lfc_lock); + PG_RETURN_INT32(dc); } - PG_RETURN_INT32(dc); + PG_RETURN_NULL(); } diff --git a/pgxn/neon/hll.c b/pgxn/neon/hll.c new file mode 100644 index 000000000000..f8496b31259d --- /dev/null +++ b/pgxn/neon/hll.c @@ -0,0 +1,193 @@ +/*------------------------------------------------------------------------- + * + * hll.c + * Sliding HyperLogLog cardinality estimator + * + * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group + * + * Implements https://hal.science/hal-00465313/document + * + * Based on Hideaki Ohno's C++ implementation. 
This is probably not ideally + * suited to estimating the cardinality of very large sets; in particular, we + * have not attempted to further optimize the implementation as described in + * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic + * Engineering of a State of The Art Cardinality Estimation Algorithm". + * + * A sparse representation of HyperLogLog state is used, with fixed space + * overhead. + * + * The copyright terms of Ohno's original version (the MIT license) follow. + * + * IDENTIFICATION + * src/backend/lib/hyperloglog.c + * + *------------------------------------------------------------------------- + */ + +/* + * Copyright (c) 2013 Hideaki Ohno + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include + +#include "postgres.h" +#include "funcapi.h" +#include "port/pg_bitutils.h" +#include "utils/timestamp.h" +#include "hll.h" + + +#define POW_2_32 (4294967296.0) +#define NEG_POW_2_32 (-4294967296.0) + +#define ALPHA_MM ((0.7213 / (1.0 + 1.079 / HLL_N_REGISTERS)) * HLL_N_REGISTERS * HLL_N_REGISTERS) + +/* + * Worker for addHyperLogLog(). + * + * Calculates the position of the first set bit in first b bits of x argument + * starting from the first, reading from most significant to least significant + * bits. + * + * Example (when considering fist 10 bits of x): + * + * rho(x = 0b1000000000) returns 1 + * rho(x = 0b0010000000) returns 3 + * rho(x = 0b0000000000) returns b + 1 + * + * "The binary address determined by the first b bits of x" + * + * Return value "j" used to index bit pattern to watch. + */ +static inline uint8 +rho(uint32 x, uint8 b) +{ + uint8 j = 1; + + if (x == 0) + return b + 1; + + j = 32 - pg_leftmost_one_pos32(x); + + if (j > b) + return b + 1; + + return j; +} + +/* + * Initialize HyperLogLog track state + */ +void +initSHLL(HyperLogLogState *cState) +{ + memset(cState->regs, 0, sizeof(cState->regs)); +} + +/* + * Adds element to the estimator, from caller-supplied hash. + * + * It is critical that the hash value passed be an actual hash value, typically + * generated using hash_any(). The algorithm relies on a specific bit-pattern + * observable in conjunction with stochastic averaging. There must be a + * uniform distribution of bits in hash values for each distinct original value + * observed. 
+ */ +void +addSHLL(HyperLogLogState *cState, uint32 hash) +{ + uint8 count; + uint32 index; + size_t i; + size_t j; + + TimestampTz now = GetCurrentTimestamp(); + /* Use the first "k" (registerWidth) bits as a zero based index */ + index = hash >> HLL_C_BITS; + + /* Compute the rank of the remaining 32 - "k" (registerWidth) bits */ + count = rho(hash << HLL_BIT_WIDTH, HLL_C_BITS); + + cState->regs[index][count] = now; +} + +static uint8 +getMaximum(const TimestampTz* reg, TimestampTz since) +{ + uint8 max = 0; + + for (size_t i = 0; i < HLL_C_BITS + 1; i++) + { + if (reg[i] >= since) + { + max = i; + } + } + + return max; +} + + +/* + * Estimates cardinality, based on elements added so far + */ +double +estimateSHLL(HyperLogLogState *cState, time_t duration) +{ + double result; + double sum = 0.0; + size_t i; + uint8 R[HLL_N_REGISTERS]; + /* 0 indicates uninitialized timestamp, so if we need to cover the whole range than starts with 1 */ + TimestampTz since = duration == (time_t)-1 ? 1 : GetCurrentTimestamp() - duration * USECS_PER_SEC; + + for (i = 0; i < HLL_N_REGISTERS; i++) + { + R[i] = getMaximum(cState->regs[i], since); + sum += 1.0 / pow(2.0, R[i]); + } + + /* result set to "raw" HyperLogLog estimate (E in the HyperLogLog paper) */ + result = ALPHA_MM / sum; + + if (result <= (5.0 / 2.0) * HLL_N_REGISTERS) + { + /* Small range correction */ + int zero_count = 0; + + for (i = 0; i < HLL_N_REGISTERS; i++) + { + zero_count += R[i] == 0; + } + + if (zero_count != 0) + result = HLL_N_REGISTERS * log((double) HLL_N_REGISTERS / + zero_count); + } + else if (result > (1.0 / 30.0) * POW_2_32) + { + /* Large range correction */ + result = NEG_POW_2_32 * log(1.0 - (result / POW_2_32)); + } + + return result; +} + diff --git a/pgxn/neon/hll.h b/pgxn/neon/hll.h new file mode 100644 index 000000000000..9256cb9afa2f --- /dev/null +++ b/pgxn/neon/hll.h @@ -0,0 +1,86 @@ +/*------------------------------------------------------------------------- + * + * hll.h + * Sliding HyperLogLog cardinality estimator + * + * Portions Copyright (c) 2014-2023, PostgreSQL Global Development Group + * + * Implements https://hal.science/hal-00465313/document + * + * Based on Hideaki Ohno's C++ implementation. This is probably not ideally + * suited to estimating the cardinality of very large sets; in particular, we + * have not attempted to further optimize the implementation as described in + * the Heule, Nunkesser and Hall paper "HyperLogLog in Practice: Algorithmic + * Engineering of a State of The Art Cardinality Estimation Algorithm". + * + * A sparse representation of HyperLogLog state is used, with fixed space + * overhead. + * + * The copyright terms of Ohno's original version (the MIT license) follow. + * + * IDENTIFICATION + * src/backend/lib/hyperloglog.c + * + *------------------------------------------------------------------------- + */ + +/* + * Copyright (c) 2013 Hideaki Ohno + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 'Software'), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef HLL_H +#define HLL_H + +#define HLL_BIT_WIDTH 10 +#define HLL_C_BITS (32 - HLL_BIT_WIDTH) +#define HLL_N_REGISTERS (1 << HLL_BIT_WIDTH) + +/* + * HyperLogLog is an approximate technique for computing the number of distinct + * entries in a set. Importantly, it does this by using a fixed amount of + * memory. See the 2007 paper "HyperLogLog: the analysis of a near-optimal + * cardinality estimation algorithm" for more. + * + * Instead of a single counter for every bits register, we have a timestamp + * for every valid number of bits we can encounter. Every time we encounter + * a certain number of bits, we update the timestamp in those registers to + * the current timestamp. + * + * We can query the sketch's stored cardinality for the range of some timestamp + * up to now: For each register, we return the highest bits bucket that has a + * modified timestamp >= the query timestamp. This value is the number of bits + * for this register in the normal HLL calculation. + * + * The memory usage is 2^B * (C + 1) * sizeof(TimetampTz), or 184kiB. + * Usage could be halved if we decide to reduce the required time dimension + * precision; as 32 bits in second precision should be enough for statistics. + * However, that is not yet implemented. + */ +typedef struct HyperLogLogState +{ + TimestampTz regs[HLL_N_REGISTERS][HLL_C_BITS + 1]; +} HyperLogLogState; + +extern void initSHLL(HyperLogLogState *cState); +extern void addSHLL(HyperLogLogState *cState, uint32 hash); +extern double estimateSHLL(HyperLogLogState *cState, time_t dutration); + +#endif diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index a665cafafe71..73a001b6ba72 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -427,12 +427,17 @@ pageserver_connect(shardno_t shard_no, int elevel) values[n_pgsql_params] = NULL; shard->conn = PQconnectStartParams(keywords, values, 1); - if (!shard->conn) + if (PQstatus(shard->conn) == CONNECTION_BAD) { - neon_shard_log(shard_no, elevel, "Failed to connect to pageserver: out of memory"); + char *msg = pchomp(PQerrorMessage(shard->conn)); + CLEANUP_AND_DISCONNECT(shard); + ereport(elevel, + (errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION), + errmsg(NEON_TAG "[shard %d] could not establish connection to pageserver", shard_no), + errdetail_internal("%s", msg))); + pfree(msg); return false; } - shard->state = PS_Connecting_Startup; /* fallthrough */ } diff --git a/pgxn/neon/neon--1.3--1.4.sql b/pgxn/neon/neon--1.3--1.4.sql new file mode 100644 index 000000000000..042effe3461c --- /dev/null +++ b/pgxn/neon/neon--1.3--1.4.sql @@ -0,0 +1,9 @@ +\echo Use "ALTER EXTENSION neon UPDATE TO '1.4'" to load this file. 
\quit + +CREATE FUNCTION approximate_working_set_size_seconds(duration integer default null) +RETURNS integer +AS 'MODULE_PATHNAME', 'approximate_working_set_size_seconds' +LANGUAGE C PARALLEL SAFE; + +GRANT EXECUTE ON FUNCTION approximate_working_set_size_seconds(integer) TO pg_monitor; + diff --git a/pgxn/neon/neon--1.4--1.3.sql b/pgxn/neon/neon--1.4--1.3.sql new file mode 100644 index 000000000000..bea72d1a6b17 --- /dev/null +++ b/pgxn/neon/neon--1.4--1.3.sql @@ -0,0 +1 @@ +DROP FUNCTION IF EXISTS approximate_working_set_size_seconds(integer) CASCADE; diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index b6b2db7e71ad..e4968bdf8991 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -12,6 +12,8 @@ #include "fmgr.h" #include "miscadmin.h" +#include "access/subtrans.h" +#include "access/twophase.h" #include "access/xact.h" #include "access/xlog.h" #include "storage/buf_internals.h" @@ -22,10 +24,12 @@ #include "replication/logical.h" #include "replication/slot.h" #include "replication/walsender.h" +#include "storage/proc.h" #include "storage/procsignal.h" #include "tcop/tcopprot.h" #include "funcapi.h" #include "access/htup_details.h" +#include "utils/builtins.h" #include "utils/pg_lsn.h" #include "utils/guc.h" #include "utils/wait_event.h" @@ -266,6 +270,293 @@ LogicalSlotsMonitorMain(Datum main_arg) } } +/* + * XXX: These private to procarray.c, but we need them here. + */ +#define PROCARRAY_MAXPROCS (MaxBackends + max_prepared_xacts) +#define TOTAL_MAX_CACHED_SUBXIDS \ + ((PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS) + +/* + * Restore running-xact information by scanning the CLOG at startup. + * + * In PostgreSQL, a standby always has to wait for a running-xacts WAL record + * to arrive before it can start accepting queries. Furthermore, if there are + * transactions with too many subxids (> 64) open to fit in the in-memory + * subxids cache, the running-xacts record will be marked as "suboverflowed", + * and the standby will need to also wait for the currently in-progress + * transactions to finish. + * + * That's not great in PostgreSQL, because a hot standby does not necessary + * open up for queries immediately as you might expect. But it's worse in + * Neon: A standby in Neon doesn't need to start WAL replay from a checkpoint + * record; it can start at any LSN. Postgres arranges things so that there is + * a running-xacts record soon after every checkpoint record, but when you + * start from an arbitrary LSN, that doesn't help. If the primary is idle, or + * not running at all, it might never write a new running-xacts record, + * leaving the replica in a limbo where it can never start accepting queries. + * + * To mitigate that, we have an additional mechanism to find the running-xacts + * information: we scan the CLOG, making note of any XIDs not marked as + * committed or aborted. They are added to the Postgres known-assigned XIDs + * array by calling ProcArrayApplyRecoveryInfo() in the caller of this + * function. + * + * There is one big limitation with that mechanism: The size of the + * known-assigned XIDs is limited, so if there are a lot of in-progress XIDs, + * we have to give up. Furthermore, we don't know how many of the in-progress + * XIDs are subtransactions, and if we use up all the space in the + * known-assigned XIDs array for subtransactions, we might run out of space in + * the array later during WAL replay, causing the replica to shut down with + * "ERROR: too many KnownAssignedXids". 
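Aside: the scan this comment goes on to describe has a simple core loop: walk the XID space from the oldest potentially running XID up to `nextXid`, classify each XID via the CLOG, keep the in-progress ones, and give up once the budget is exceeded. With the default `PGPROC_MAX_CACHED_SUBXIDS` of 64 and, say, `MaxBackends + max_prepared_xacts` = 100, that budget works out to (64 + 1) * 100 / 2 = 3250 XIDs. A hedged sketch of the loop (Rust pseudocode; `clog_status` stands in for `TransactionIdGetStatus`, and the merging of prepared transactions and their subtransactions is left out):

```rust
#[derive(PartialEq)]
enum XidStatus {
    InProgress,
    Committed,
    Aborted,
}

/// Sketch only: collect in-progress XIDs in [from, till), bailing out (None)
/// if more than `max_xcnt` are found, in which case the caller falls back to
/// waiting for a running-xacts WAL record as usual.
fn scan_clog_for_running_xacts(
    from: u32,
    till: u32,
    max_xcnt: usize,
    clog_status: impl Fn(u32) -> XidStatus,
) -> Option<Vec<u32>> {
    let mut running = Vec::new();
    let mut xid = from;
    while xid != till {
        if clog_status(xid) == XidStatus::InProgress {
            if running.len() >= max_xcnt {
                return None;
            }
            running.push(xid);
        }
        // advance with wraparound, skipping the special XIDs below
        // FirstNormalTransactionId (3), like TransactionIdAdvance() does
        xid = xid.wrapping_add(1);
        if xid < 3 {
            xid = 3;
        }
    }
    Some(running)
}
```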
The safe # of XIDs that we can add to + * the known-assigned array without risking that error later is very low, + * merely PGPROC_MAX_CACHED_SUBXIDS == 64, so we take our chances and use up + * to half of the known-assigned XIDs array for the subtransactions, even + * though that risks getting the error later. + * + * Note: It's OK if the recovered list of XIDs includes some transactions that + * have crashed in the primary, and hence will never commit. They will be seen + * as in-progress, until we see a new next running-acts record with an + * oldestActiveXid that invalidates them. That's how the known-assigned XIDs + * array always works. + * + * If scraping the CLOG doesn't succeed for some reason, like the subxid + * overflow, Postgres will fall back to waiting for a running-xacts record + * like usual. + * + * Returns true if a complete list of in-progress XIDs was scraped. + */ +static bool +RestoreRunningXactsFromClog(CheckPoint *checkpoint, TransactionId **xids, int *nxids) +{ + TransactionId from; + TransactionId till; + int max_xcnt; + TransactionId *prepared_xids = NULL; + int n_prepared_xids; + TransactionId *restored_xids = NULL; + int n_restored_xids; + int next_prepared_idx; + + Assert(*xids == NULL); + + /* + * If the checkpoint doesn't have a valid oldestActiveXid, bail out. We + * don't know where to start the scan. + * + * This shouldn't happen, because the pageserver always maintains a valid + * oldestActiveXid nowadays. Except when starting at an old point in time + * that was ingested before the pageserver was taught to do that. + */ + if (!TransactionIdIsValid(checkpoint->oldestActiveXid)) + { + elog(LOG, "cannot restore running-xacts from CLOG because oldestActiveXid is not set"); + goto fail; + } + + /* + * We will scan the CLOG starting from the oldest active XID. + * + * In some corner cases, the oldestActiveXid from the last checkpoint + * might already have been truncated from the CLOG. That is, + * oldestActiveXid might be older than oldestXid. That's possible because + * oldestActiveXid is only updated at checkpoints. After the last + * checkpoint, the oldest transaction might have committed, and the CLOG + * might also have been already truncated. So if oldestActiveXid is older + * than oldestXid, start at oldestXid instead. (Otherwise we'd try to + * access CLOG segments that have already been truncated away.) + */ + from = TransactionIdPrecedes(checkpoint->oldestXid, checkpoint->oldestActiveXid) + ? checkpoint->oldestActiveXid : checkpoint->oldestXid; + till = XidFromFullTransactionId(checkpoint->nextXid); + + /* + * To avoid "too many KnownAssignedXids" error later during replay, we + * limit number of collected transactions. This is a tradeoff: if we are + * willing to consume more of the KnownAssignedXids space for the XIDs + * now, that allows us to start up, but we might run out of space later. + * + * The size of the KnownAssignedXids array is TOTAL_MAX_CACHED_SUBXIDS, + * which is (PGPROC_MAX_CACHED_SUBXIDS + 1) * PROCARRAY_MAXPROCS). In + * PostgreSQL, that's always enough because the primary will always write + * an XLOG_XACT_ASSIGNMENT record if a transaction has more than + * PGPROC_MAX_CACHED_SUBXIDS subtransactions. Seeing that record allows + * the standby to mark the XIDs in pg_subtrans and removing them from the + * KnowingAssignedXids array. + * + * Here, we don't know which XIDs belong to subtransactions that have + * already been WAL-logged with an XLOG_XACT_ASSIGNMENT record. 
If we + * wanted to be totally safe and avoid the possibility of getting a "too + * many KnownAssignedXids" error later, we would have to limit ourselves + * to PGPROC_MAX_CACHED_SUBXIDS, which is not much. And that includes top + * transaction IDs too, because we cannot distinguish between top + * transaction IDs and subtransactions here. + * + * Somewhat arbitrarily, we use up to half of KnownAssignedXids. That + * strikes a sensible balance between being useful, and risking a "too + * many KnownAssignedXids" error later. + */ + max_xcnt = TOTAL_MAX_CACHED_SUBXIDS / 2; + + /* + * Collect XIDs of prepared transactions in an array. This includes only + * their top-level XIDs. We assume that StandbyRecoverPreparedTransactions + * has already been called, so we can find all the sub-transactions in + * pg_subtrans. + */ + PrescanPreparedTransactions(&prepared_xids, &n_prepared_xids); + qsort(prepared_xids, n_prepared_xids, sizeof(TransactionId), xidLogicalComparator); + + /* + * Scan the CLOG, collecting in-progress XIDs into 'restored_xids'. + */ + elog(DEBUG1, "scanning CLOG between %u and %u for in-progress XIDs", from, till); + restored_xids = (TransactionId *) palloc(max_xcnt * sizeof(TransactionId)); + n_restored_xids = 0; + next_prepared_idx = 0; + for (TransactionId xid = from; xid != till;) + { + XLogRecPtr xidlsn; + XidStatus xidstatus; + + xidstatus = TransactionIdGetStatus(xid, &xidlsn); + + /* + * "Merge" the prepared transactions into the restored_xids array as + * we go. The prepared transactions array is sorted. This is mostly + * a sanity check to ensure that all the prpeared transactions are + * seen as in-progress. (There is a check after the loop that we didn't + * miss any.) + */ + if (next_prepared_idx < n_prepared_xids && xid == prepared_xids[next_prepared_idx]) + { + /* + * This is a top-level transaction ID of a prepared transaction. + * Include it in the array. + */ + + /* sanity check */ + if (xidstatus != TRANSACTION_STATUS_IN_PROGRESS) + { + elog(LOG, "prepared transaction %u has unexpected status %X, cannot restore running-xacts from CLOG", + xid, xidstatus); + Assert(false); + goto fail; + } + + elog(DEBUG1, "XID %u: was next prepared xact (%d / %d)", xid, next_prepared_idx, n_prepared_xids); + next_prepared_idx++; + } + else if (xidstatus == TRANSACTION_STATUS_COMMITTED) + { + elog(DEBUG1, "XID %u: was committed", xid); + goto skip; + } + else if (xidstatus == TRANSACTION_STATUS_ABORTED) + { + elog(DEBUG1, "XID %u: was aborted", xid); + goto skip; + } + else if (xidstatus == TRANSACTION_STATUS_IN_PROGRESS) + { + /* + * In-progress transactions are included in the array. + * + * Except subtransactions of the prepared transactions. They are + * already set in pg_subtrans, and hence don't need to be tracked + * in the known-assigned XIDs array. + */ + if (n_prepared_xids > 0) + { + TransactionId parent = SubTransGetParent(xid); + + if (TransactionIdIsValid(parent)) + { + /* + * This is a subtransaction belonging to a prepared + * transaction. + * + * Sanity check that it is in the prepared XIDs array. It + * should be, because StandbyRecoverPreparedTransactions + * populated pg_subtrans, and no other XID should be set + * in it yet. (This also relies on the fact that + * StandbyRecoverPreparedTransactions sets the parent of + * each subxid to point directly to the top-level XID, + * rather than restoring the original subtransaction + * hierarchy.) 
+ */ + if (bsearch(&parent, prepared_xids, next_prepared_idx, + sizeof(TransactionId), xidLogicalComparator) == NULL) + { + elog(LOG, "sub-XID %u has unexpected parent %u, cannot restore running-xacts from CLOG", + xid, parent); + Assert(false); + goto fail; + } + elog(DEBUG1, "XID %u: was a subtransaction of prepared xid %u", xid, parent); + goto skip; + } + } + + /* include it in the array */ + elog(DEBUG1, "XID %u: is in progress", xid); + } + else + { + /* + * SUB_COMMITTED is a transient state used at commit. We don't + * expect to see that here. + */ + elog(LOG, "XID %u has unexpected status %X in pg_xact, cannot restore running-xacts from CLOG", + xid, xidstatus); + Assert(false); + goto fail; + } + + if (n_restored_xids >= max_xcnt) + { + /* + * Overflowed. We won't be able to install the RunningTransactions + * snapshot. + */ + elog(LOG, "too many running xacts to restore from the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u", + checkpoint->oldestXid, checkpoint->oldestActiveXid, + XidFromFullTransactionId(checkpoint->nextXid)); + goto fail; + } + + restored_xids[n_restored_xids++] = xid; + + skip: + TransactionIdAdvance(xid); + continue; + } + + /* sanity check */ + if (next_prepared_idx != n_prepared_xids) + { + elog(LOG, "prepared transaction ID %u was not visited in the CLOG scan, cannot restore running-xacts from CLOG", + prepared_xids[next_prepared_idx]); + Assert(false); + goto fail; + } + + elog(LOG, "restored %d running xacts by scanning the CLOG; oldestXid=%u oldestActiveXid=%u nextXid %u", + n_restored_xids, checkpoint->oldestXid, checkpoint->oldestActiveXid, XidFromFullTransactionId(checkpoint->nextXid)); + *nxids = n_restored_xids; + *xids = restored_xids; + return true; + + fail: + *nxids = 0; + *xids = NULL; + if (restored_xids) + pfree(restored_xids); + if (prepared_xids) + pfree(prepared_xids); + return false; +} + void _PG_init(void) { @@ -288,6 +579,8 @@ _PG_init(void) pg_init_extension_server(); + restore_running_xacts_callback = RestoreRunningXactsFromClog; + /* * Important: This must happen after other parts of the extension are * loaded, otherwise any settings to GUCs that were set before the diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index da1a6f76f0a2..944b316344dd 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -63,6 +63,8 @@ char *wal_acceptors_list = ""; int wal_acceptor_reconnect_timeout = 1000; int wal_acceptor_connection_timeout = 10000; +/* Set to true in the walproposer bgw. 
*/ +static bool am_walproposer; static WalproposerShmemState *walprop_shared; static WalProposerConfig walprop_config; static XLogRecPtr sentPtr = InvalidXLogRecPtr; @@ -76,6 +78,7 @@ static HotStandbyFeedback agg_hs_feedback; static void nwp_shmem_startup_hook(void); static void nwp_register_gucs(void); +static void assign_neon_safekeepers(const char *newval, void *extra); static void nwp_prepare_shmem(void); static uint64 backpressure_lag_impl(void); static bool backpressure_throttling_impl(void); @@ -111,7 +114,8 @@ init_walprop_config(bool syncSafekeepers) { walprop_config.neon_tenant = neon_tenant; walprop_config.neon_timeline = neon_timeline; - walprop_config.safekeepers_list = wal_acceptors_list; + /* WalProposerCreate scribbles directly on it, so pstrdup */ + walprop_config.safekeepers_list = pstrdup(wal_acceptors_list); walprop_config.safekeeper_reconnect_timeout = wal_acceptor_reconnect_timeout; walprop_config.safekeeper_connection_timeout = wal_acceptor_connection_timeout; walprop_config.wal_segment_size = wal_segment_size; @@ -151,6 +155,7 @@ WalProposerMain(Datum main_arg) init_walprop_config(false); walprop_pg_init_bgworker(); + am_walproposer = true; walprop_pg_load_libpqwalreceiver(); wp = WalProposerCreate(&walprop_config, walprop_pg); @@ -189,10 +194,10 @@ nwp_register_gucs(void) NULL, /* long_desc */ &wal_acceptors_list, /* valueAddr */ "", /* bootValue */ - PGC_POSTMASTER, + PGC_SIGHUP, GUC_LIST_INPUT, /* extensions can't use* * GUC_LIST_QUOTE */ - NULL, NULL, NULL); + NULL, assign_neon_safekeepers, NULL); DefineCustomIntVariable( "neon.safekeeper_reconnect_timeout", @@ -215,6 +220,33 @@ nwp_register_gucs(void) NULL, NULL, NULL); } +/* + * GUC assign_hook for neon.safekeepers. Restarts walproposer through FATAL if + * the list changed. + */ +static void +assign_neon_safekeepers(const char *newval, void *extra) +{ + if (!am_walproposer) + return; + + if (!newval) { + /* should never happen */ + wpg_log(FATAL, "neon.safekeepers is empty"); + } + + /* + * TODO: restarting through FATAL is stupid and introduces 1s delay before + * next bgw start. We should refactor walproposer to allow graceful exit and + * thus remove this delay. + */ + if (strcmp(wal_acceptors_list, newval) != 0) + { + wpg_log(FATAL, "restarting walproposer to change safekeeper list from %s to %s", + wal_acceptors_list, newval); + } +} + /* Check if we need to suspend inserts because of lagging replication. */ static uint64 backpressure_lag_impl(void) @@ -363,7 +395,7 @@ walprop_register_bgworker(void) snprintf(bgw.bgw_function_name, BGW_MAXLEN, "WalProposerMain"); snprintf(bgw.bgw_name, BGW_MAXLEN, "WAL proposer"); snprintf(bgw.bgw_type, BGW_MAXLEN, "WAL proposer"); - bgw.bgw_restart_time = 5; + bgw.bgw_restart_time = 1; bgw.bgw_notify_pid = 0; bgw.bgw_main_arg = (Datum) 0; @@ -1639,6 +1671,18 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 late_cv_trigger = ConditionVariableCancelSleep(); #endif + /* + * Process config if requested. This restarts walproposer if safekeepers + * list changed. Don't do that for sync-safekeepers because quite probably + * it (re-reading config) won't work without some effort, and + * sync-safekeepers should be quick to finish anyway. + */ + if (!wp->config->syncSafekeepers && ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + /* * If wait is terminated by latch set (walsenders' latch is set on each * wal flush). 
(no need for pm death check due to WL_EXIT_ON_PM_DEATH) diff --git a/pgxn/neon_test_utils/Makefile b/pgxn/neon_test_utils/Makefile index 1ee87357e5e2..252810b5b02e 100644 --- a/pgxn/neon_test_utils/Makefile +++ b/pgxn/neon_test_utils/Makefile @@ -7,7 +7,7 @@ OBJS = \ neontest.o EXTENSION = neon_test_utils -DATA = neon_test_utils--1.1.sql +DATA = neon_test_utils--1.3.sql PGFILEDESC = "neon_test_utils - helpers for neon testing and debugging" PG_CONFIG = pg_config diff --git a/pgxn/neon_test_utils/neon_test_utils--1.1.sql b/pgxn/neon_test_utils/neon_test_utils--1.3.sql similarity index 74% rename from pgxn/neon_test_utils/neon_test_utils--1.1.sql rename to pgxn/neon_test_utils/neon_test_utils--1.3.sql index 534784f31912..3b8794a8cff4 100644 --- a/pgxn/neon_test_utils/neon_test_utils--1.1.sql +++ b/pgxn/neon_test_utils/neon_test_utils--1.3.sql @@ -41,7 +41,25 @@ RETURNS bytea AS 'MODULE_PATHNAME', 'get_raw_page_at_lsn_ex' LANGUAGE C PARALLEL UNSAFE; -CREATE FUNCTION neon_xlogflush(lsn pg_lsn) +CREATE FUNCTION neon_xlogflush(lsn pg_lsn DEFAULT NULL) RETURNS VOID AS 'MODULE_PATHNAME', 'neon_xlogflush' LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION trigger_panic() +RETURNS VOID +AS 'MODULE_PATHNAME', 'trigger_panic' +LANGUAGE C PARALLEL UNSAFE; + +CREATE FUNCTION trigger_segfault() +RETURNS VOID +AS 'MODULE_PATHNAME', 'trigger_segfault' +LANGUAGE C PARALLEL UNSAFE; + +-- Alias for `trigger_segfault`, just because `SELECT 💣()` looks fun +CREATE OR REPLACE FUNCTION 💣() RETURNS void +LANGUAGE plpgsql AS $$ +BEGIN + PERFORM trigger_segfault(); +END; +$$; diff --git a/pgxn/neon_test_utils/neon_test_utils.control b/pgxn/neon_test_utils/neon_test_utils.control index 5f6d64083591..f22afd70c4fa 100644 --- a/pgxn/neon_test_utils/neon_test_utils.control +++ b/pgxn/neon_test_utils/neon_test_utils.control @@ -1,6 +1,6 @@ # neon_test_utils extension comment = 'helpers for neon testing and debugging' -default_version = '1.1' +default_version = '1.3' module_pathname = '$libdir/neon_test_utils' relocatable = true trusted = true diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c index 47f245fbf1af..650ef7405d64 100644 --- a/pgxn/neon_test_utils/neontest.c +++ b/pgxn/neon_test_utils/neontest.c @@ -15,6 +15,7 @@ #include "access/relation.h" #include "access/xact.h" #include "access/xlog.h" +#include "access/xlog_internal.h" #include "catalog/namespace.h" #include "fmgr.h" #include "funcapi.h" @@ -41,6 +42,8 @@ PG_FUNCTION_INFO_V1(clear_buffer_cache); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn); PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex); PG_FUNCTION_INFO_V1(neon_xlogflush); +PG_FUNCTION_INFO_V1(trigger_panic); +PG_FUNCTION_INFO_V1(trigger_segfault); /* * Linkage to functions in neon module. @@ -444,12 +447,68 @@ get_raw_page_at_lsn_ex(PG_FUNCTION_ARGS) /* * Directly calls XLogFlush(lsn) to flush WAL buffers. + * + * If 'lsn' is not specified (is NULL), flush all generated WAL. */ Datum neon_xlogflush(PG_FUNCTION_ARGS) { - XLogRecPtr lsn = PG_GETARG_LSN(0); + XLogRecPtr lsn; + + if (RecoveryInProgress()) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("recovery is in progress"), + errhint("cannot flush WAL during recovery."))); + + if (!PG_ARGISNULL(0)) + lsn = PG_GETARG_LSN(0); + else + { + lsn = GetXLogInsertRecPtr(); + + /*--- + * The LSN returned by GetXLogInsertRecPtr() is the position where the + * next inserted record would begin. 
If the last record ended just at + * the page boundary, the next record will begin after the page header + * on the next page, but the next page's page header has not been + * written yet. If we tried to flush it, XLogFlush() would throw an + * error: + * + * ERROR : xlog flush request %X/%X is not satisfied --- flushed only to %X/%X + * + * To avoid that, if the insert position points to just after the page + * header, back off to page boundary. + */ + if (lsn % XLOG_BLCKSZ == SizeOfXLogShortPHD && + XLogSegmentOffset(lsn, wal_segment_size) > XLOG_BLCKSZ) + lsn -= SizeOfXLogShortPHD; + else if (lsn % XLOG_BLCKSZ == SizeOfXLogLongPHD && + XLogSegmentOffset(lsn, wal_segment_size) < XLOG_BLCKSZ) + lsn -= SizeOfXLogLongPHD; + } XLogFlush(lsn); PG_RETURN_VOID(); } + +/* + * Function to trigger panic. + */ +Datum +trigger_panic(PG_FUNCTION_ARGS) +{ + elog(PANIC, "neon_test_utils: panic"); + PG_RETURN_VOID(); +} + +/* + * Function to trigger a segfault. + */ +Datum +trigger_segfault(PG_FUNCTION_ARGS) +{ + int *ptr = NULL; + *ptr = 42; + PG_RETURN_VOID(); +} diff --git a/poetry.lock b/poetry.lock index 7740388fb8be..809114141188 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aiohttp" @@ -734,13 +734,13 @@ typing-extensions = ">=4.1.0" [[package]] name = "certifi" -version = "2023.7.22" +version = "2024.7.4" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"}, - {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"}, + {file = "certifi-2024.7.4-py3-none-any.whl", hash = "sha256:c198e21b1289c2ab85ee4e67bb4b4ef3ead0892059901a8d5b622f24a1101e90"}, + {file = "certifi-2024.7.4.tar.gz", hash = "sha256:5a1e7645bc0ec61a09e26c36f6106dd4cf40c6db3a1fb6352b0244e7fb057c7b"}, ] [[package]] @@ -3133,18 +3133,18 @@ multidict = ">=4.0" [[package]] name = "zipp" -version = "3.8.1" +version = "3.19.1" description = "Backport of pathlib-compatible object wrapper for zip files" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "zipp-3.8.1-py3-none-any.whl", hash = "sha256:47c40d7fe183a6f21403a199b3e4192cca5774656965b0a4988ad2f8feb5f009"}, - {file = "zipp-3.8.1.tar.gz", hash = "sha256:05b45f1ee8f807d0cc928485ca40a07cb491cf092ff587c0df9cb1fd154848d2"}, + {file = "zipp-3.19.1-py3-none-any.whl", hash = "sha256:2828e64edb5386ea6a52e7ba7cdb17bb30a73a858f5eb6eb93d8d36f5ea26091"}, + {file = "zipp-3.19.1.tar.gz", hash = "sha256:35427f6d5594f4acf82d25541438348c26736fa9b3afa2754bcd63cdb99d8e8f"}, ] [package.extras] -docs = ["jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx"] -testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +test = ["big-O", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", 
"pytest-ignore-flaky", "pytest-mypy", "pytest-ruff (>=0.2.1)"] [[package]] name = "zstandard" diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 44e880838e07..d7a3eb9a4d18 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -216,10 +216,11 @@ async fn ssl_handshake( use pq_proto::FeStartupPacket::*; match msg { - SslRequest => { + SslRequest { direct: false } => { stream .write_message(&pq_proto::BeMessage::EncryptionResponse(true)) .await?; + // Upgrade raw stream into a secure TLS-backed stream. // NOTE: We've consumed `tls`; this fact will be used later. diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index dffebf55800c..7f4cb2c0100c 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -35,6 +35,7 @@ use proxy::usage_metrics; use anyhow::bail; use proxy::config::{self, ProxyConfig}; use proxy::serverless; +use remote_storage::RemoteStorageConfig; use std::net::SocketAddr; use std::pin::pin; use std::sync::Arc; @@ -205,8 +206,8 @@ struct ProxyCliArgs { /// remote storage configuration for backup metric collection /// Encoded as toml (same format as pageservers), eg /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` - #[clap(long, default_value = "{}")] - metric_backup_collection_remote_storage: String, + #[clap(long, value_parser = remote_storage_from_toml)] + metric_backup_collection_remote_storage: Option, /// chunk size for backup metric collection /// Size of each event is no more than 400 bytes, so 2**22 is about 200MB before the compression. #[clap(long, default_value = "4194304")] @@ -511,9 +512,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { } let backup_metric_collection_config = config::MetricBackupCollectionConfig { interval: args.metric_backup_collection_interval, - remote_storage_config: remote_storage_from_toml( - &args.metric_backup_collection_remote_storage, - )?, + remote_storage_config: args.metric_backup_collection_remote_storage.clone(), chunk_size: args.metric_backup_collection_chunk_size, }; diff --git a/proxy/src/cache/common.rs b/proxy/src/cache/common.rs index bc1c37512bce..4e393fddb2aa 100644 --- a/proxy/src/cache/common.rs +++ b/proxy/src/cache/common.rs @@ -53,6 +53,13 @@ impl Cached { ) } + pub fn map(self, f: impl FnOnce(V) -> U) -> Cached { + Cached { + token: self.token, + value: f(self.value), + } + } + /// Drop this entry from a cache if it's still there. pub fn invalidate(self) -> V { if let Some((cache, info)) = &self.token { diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 3b21381bb971..c5c4f6a1ed09 100644 --- a/proxy/src/cache/timed_lru.rs +++ b/proxy/src/cache/timed_lru.rs @@ -65,6 +65,8 @@ impl Cache for TimedLru { struct Entry { created_at: Instant, expires_at: Instant, + ttl: Duration, + update_ttl_on_retrieval: bool, value: T, } @@ -122,7 +124,6 @@ impl TimedLru { Q: Hash + Eq + ?Sized, { let now = Instant::now(); - let deadline = now.checked_add(self.ttl).expect("time overflow"); // Do costly things before taking the lock. let mut cache = self.cache.lock(); @@ -142,7 +143,8 @@ impl TimedLru { let (created_at, expires_at) = (entry.created_at, entry.expires_at); // Update the deadline and the entry's position in the LRU list. 
- if self.update_ttl_on_retrieval { + let deadline = now.checked_add(raw_entry.get().ttl).expect("time overflow"); + if raw_entry.get().update_ttl_on_retrieval { raw_entry.get_mut().expires_at = deadline; } raw_entry.to_back(); @@ -162,12 +164,27 @@ impl TimedLru { /// existed, return the previous value and its creation timestamp. #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] fn insert_raw(&self, key: K, value: V) -> (Instant, Option) { + self.insert_raw_ttl(key, value, self.ttl, self.update_ttl_on_retrieval) + } + + /// Insert an entry to the cache. If an entry with the same key already + /// existed, return the previous value and its creation timestamp. + #[tracing::instrument(level = "debug", fields(cache = self.name), skip_all)] + fn insert_raw_ttl( + &self, + key: K, + value: V, + ttl: Duration, + update: bool, + ) -> (Instant, Option) { let created_at = Instant::now(); - let expires_at = created_at.checked_add(self.ttl).expect("time overflow"); + let expires_at = created_at.checked_add(ttl).expect("time overflow"); let entry = Entry { created_at, expires_at, + ttl, + update_ttl_on_retrieval: update, value, }; @@ -190,6 +207,21 @@ impl TimedLru { } impl TimedLru { + pub fn insert_ttl(&self, key: K, value: V, ttl: Duration) { + self.insert_raw_ttl(key, value, ttl, false); + } + + pub fn insert_unit(&self, key: K, value: V) -> (Option, Cached<&Self, ()>) { + let (created_at, old) = self.insert_raw(key.clone(), value); + + let cached = Cached { + token: Some((self, LookupInfo { created_at, key })), + value: (), + }; + + (old, cached) + } + pub fn insert(&self, key: K, value: V) -> (Option, Cached<&Self>) { let (created_at, old) = self.insert_raw(key.clone(), value.clone()); diff --git a/proxy/src/config.rs b/proxy/src/config.rs index f4707a33aa79..650491976053 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -75,6 +75,9 @@ impl TlsConfig { } } +/// +pub const PG_ALPN_PROTOCOL: &[u8] = b"postgresql"; + /// Configure TLS for the main endpoint. pub fn configure_tls( key_path: &str, @@ -111,16 +114,17 @@ pub fn configure_tls( let cert_resolver = Arc::new(cert_resolver); // allow TLS 1.2 to be compatible with older client libraries - let config = rustls::ServerConfig::builder_with_protocol_versions(&[ + let mut config = rustls::ServerConfig::builder_with_protocol_versions(&[ &rustls::version::TLS13, &rustls::version::TLS12, ]) .with_no_client_auth() - .with_cert_resolver(cert_resolver.clone()) - .into(); + .with_cert_resolver(cert_resolver.clone()); + + config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()]; Ok(TlsConfig { - config, + config: Arc::new(config), common_names, cert_resolver, }) @@ -399,15 +403,11 @@ impl FromStr for EndpointCacheConfig { #[derive(Debug)] pub struct MetricBackupCollectionConfig { pub interval: Duration, - pub remote_storage_config: OptRemoteStorageConfig, + pub remote_storage_config: Option, pub chunk_size: usize, } -/// Hack to avoid clap being smarter. If you don't use this type alias, clap assumes more about the optional state and you get -/// runtime type errors from the value parser we use. -pub type OptRemoteStorageConfig = Option; - -pub fn remote_storage_from_toml(s: &str) -> anyhow::Result { +pub fn remote_storage_from_toml(s: &str) -> anyhow::Result { RemoteStorageConfig::from_toml(&s.parse()?) 
} diff --git a/proxy/src/console/messages.rs b/proxy/src/console/messages.rs index d28d13ba692b..9abf24ab7ffa 100644 --- a/proxy/src/console/messages.rs +++ b/proxy/src/console/messages.rs @@ -9,7 +9,7 @@ use crate::proxy::retry::CouldRetry; /// Generic error response with human-readable description. /// Note that we can't always present it to user as is. -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct ConsoleError { pub error: Box, #[serde(skip)] @@ -82,41 +82,19 @@ impl CouldRetry for ConsoleError { .details .error_info .map_or(Reason::Unknown, |e| e.reason); - match reason { - // not a transitive error - Reason::RoleProtected => false, - // on retry, it will still not be found - Reason::ResourceNotFound - | Reason::ProjectNotFound - | Reason::EndpointNotFound - | Reason::BranchNotFound => false, - // we were asked to go away - Reason::RateLimitExceeded - | Reason::NonDefaultBranchComputeTimeExceeded - | Reason::ActiveTimeQuotaExceeded - | Reason::ComputeTimeQuotaExceeded - | Reason::WrittenDataQuotaExceeded - | Reason::DataTransferQuotaExceeded - | Reason::LogicalSizeQuotaExceeded => false, - // transitive error. control plane is currently busy - // but might be ready soon - Reason::RunningOperations => true, - Reason::ConcurrencyLimitReached => true, - Reason::LockAlreadyTaken => true, - // unknown error. better not retry it. - Reason::Unknown => false, - } + + reason.can_retry() } } -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct Status { pub code: Box, pub message: Box, pub details: Details, } -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct Details { pub error_info: Option, pub retry_info: Option, @@ -199,6 +177,34 @@ impl Reason { | Reason::BranchNotFound ) } + + pub fn can_retry(&self) -> bool { + match self { + // do not retry role protected errors + // not a transitive error + Reason::RoleProtected => false, + // on retry, it will still not be found + Reason::ResourceNotFound + | Reason::ProjectNotFound + | Reason::EndpointNotFound + | Reason::BranchNotFound => false, + // we were asked to go away + Reason::RateLimitExceeded + | Reason::NonDefaultBranchComputeTimeExceeded + | Reason::ActiveTimeQuotaExceeded + | Reason::ComputeTimeQuotaExceeded + | Reason::WrittenDataQuotaExceeded + | Reason::DataTransferQuotaExceeded + | Reason::LogicalSizeQuotaExceeded => false, + // transitive error. control plane is currently busy + // but might be ready soon + Reason::RunningOperations + | Reason::ConcurrencyLimitReached + | Reason::LockAlreadyTaken => true, + // unknown error. better not retry it. 
+ Reason::Unknown => false, + } + } } #[derive(Copy, Clone, Debug, Deserialize)] @@ -206,7 +212,7 @@ pub struct RetryInfo { pub retry_delay_ms: u64, } -#[derive(Debug, Deserialize)] +#[derive(Debug, Deserialize, Clone)] pub struct UserFacingMessage { pub message: Box, } diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs index c7a2d467c016..befe7d75104b 100644 --- a/proxy/src/console/mgmt.rs +++ b/proxy/src/console/mgmt.rs @@ -6,8 +6,9 @@ use anyhow::Context; use once_cell::sync::Lazy; use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; -use std::{convert::Infallible, future}; +use std::convert::Infallible; use tokio::net::{TcpListener, TcpStream}; +use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, Instrument}; static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); @@ -67,7 +68,9 @@ pub async fn task_main(listener: TcpListener) -> anyhow::Result { async fn handle_connection(socket: TcpStream) -> Result<(), QueryError> { let pgbackend = PostgresBackend::new(socket, AuthType::Trust, None)?; - pgbackend.run(&mut MgmtHandler, future::pending::<()>).await + pgbackend + .run(&mut MgmtHandler, &CancellationToken::new()) + .await } /// A message received by `mgmt` when a compute node is ready. diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index bec55a83435f..7a9637066fb1 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -2,7 +2,7 @@ pub mod mock; pub mod neon; -use super::messages::MetricsAuxInfo; +use super::messages::{ConsoleError, MetricsAuxInfo}; use crate::{ auth::{ backend::{ComputeCredentialKeys, ComputeUserInfo}, @@ -317,8 +317,8 @@ impl NodeInfo { } } -pub type NodeInfoCache = TimedLru; -pub type CachedNodeInfo = Cached<&'static NodeInfoCache>; +pub type NodeInfoCache = TimedLru>>; +pub type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>; pub type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; pub type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs index 41bd2f49567e..a6e67be22f13 100644 --- a/proxy/src/console/provider/neon.rs +++ b/proxy/src/console/provider/neon.rs @@ -9,7 +9,7 @@ use super::{ use crate::{ auth::backend::ComputeUserInfo, compute, - console::messages::ColdStartInfo, + console::messages::{ColdStartInfo, Reason}, http, metrics::{CacheOutcome, Metrics}, rate_limiter::EndpointRateLimiter, @@ -17,10 +17,10 @@ use crate::{ }; use crate::{cache::Cached, context::RequestMonitoring}; use futures::TryFutureExt; -use std::sync::Arc; +use std::{sync::Arc, time::Duration}; use tokio::time::Instant; use tokio_postgres::config::SslMode; -use tracing::{error, info, info_span, warn, Instrument}; +use tracing::{debug, error, info, info_span, warn, Instrument}; pub struct Api { endpoint: http::Endpoint, @@ -273,26 +273,34 @@ impl super::Api for Api { ) -> Result { let key = user_info.endpoint_cache_key(); + macro_rules! 
check_cache { + () => { + if let Some(cached) = self.caches.node_info.get(&key) { + let (cached, info) = cached.take_value(); + let info = info.map_err(|c| { + info!(key = &*key, "found cached wake_compute error"); + WakeComputeError::ApiError(ApiError::Console(*c)) + })?; + + debug!(key = &*key, "found cached compute node info"); + ctx.set_project(info.aux.clone()); + return Ok(cached.map(|()| info)); + } + }; + } + // Every time we do a wakeup http request, the compute node will stay up // for some time (highly depends on the console's scale-to-zero policy); // The connection info remains the same during that period of time, // which means that we might cache it to reduce the load and latency. - if let Some(cached) = self.caches.node_info.get(&key) { - info!(key = &*key, "found cached compute node info"); - ctx.set_project(cached.aux.clone()); - return Ok(cached); - } + check_cache!(); let permit = self.locks.get_permit(&key).await?; // after getting back a permit - it's possible the cache was filled // double check if permit.should_check_cache() { - if let Some(cached) = self.caches.node_info.get(&key) { - info!(key = &*key, "found cached compute node info"); - ctx.set_project(cached.aux.clone()); - return Ok(cached); - } + check_cache!(); } // check rate limit @@ -300,23 +308,56 @@ impl super::Api for Api { .wake_compute_endpoint_rate_limiter .check(user_info.endpoint.normalize_intern(), 1) { - info!(key = &*key, "found cached compute node info"); return Err(WakeComputeError::TooManyConnections); } - let mut node = permit.release_result(self.do_wake_compute(ctx, user_info).await)?; - ctx.set_project(node.aux.clone()); - let cold_start_info = node.aux.cold_start_info; - info!("woken up a compute node"); + let node = permit.release_result(self.do_wake_compute(ctx, user_info).await); + match node { + Ok(node) => { + ctx.set_project(node.aux.clone()); + debug!(key = &*key, "created a cache entry for woken compute node"); - // store the cached node as 'warm' - node.aux.cold_start_info = ColdStartInfo::WarmCached; - let (_, mut cached) = self.caches.node_info.insert(key.clone(), node); - cached.aux.cold_start_info = cold_start_info; + let mut stored_node = node.clone(); + // store the cached node as 'warm_cached' + stored_node.aux.cold_start_info = ColdStartInfo::WarmCached; - info!(key = &*key, "created a cache entry for compute node info"); + let (_, cached) = self.caches.node_info.insert_unit(key, Ok(stored_node)); - Ok(cached) + Ok(cached.map(|()| node)) + } + Err(err) => match err { + WakeComputeError::ApiError(ApiError::Console(err)) => { + let Some(status) = &err.status else { + return Err(WakeComputeError::ApiError(ApiError::Console(err))); + }; + + let reason = status + .details + .error_info + .map_or(Reason::Unknown, |x| x.reason); + + // if we can retry this error, do not cache it. + if reason.can_retry() { + return Err(WakeComputeError::ApiError(ApiError::Console(err))); + } + + // at this point, we should only have quota errors. 
+ debug!( + key = &*key, + "created a cache entry for the wake compute error" + ); + + self.caches.node_info.insert_ttl( + key, + Err(Box::new(err.clone())), + Duration::from_secs(30), + ); + + Err(WakeComputeError::ApiError(ApiError::Console(err))) + } + err => return Err(err), + }, + } } } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index e72bf199e362..cfc1f8e89e3f 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -14,17 +14,14 @@ use parquet::{ record::RecordWriter, }; use pq_proto::StartupMessageParams; -use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; +use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel}; use serde::ser::SerializeMap; use tokio::{sync::mpsc, time}; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; -use crate::{ - config::{remote_storage_from_toml, OptRemoteStorageConfig}, - context::LOG_CHAN_DISCONNECT, -}; +use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT}; use super::{RequestMonitoring, LOG_CHAN}; @@ -33,11 +30,11 @@ pub struct ParquetUploadArgs { /// Storage location to upload the parquet files to. /// Encoded as toml (same format as pageservers), eg /// `{bucket_name='the-bucket',bucket_region='us-east-1',prefix_in_bucket='proxy',endpoint='http://minio:9000'}` - #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] - parquet_upload_remote_storage: OptRemoteStorageConfig, + #[clap(long, value_parser = remote_storage_from_toml)] + parquet_upload_remote_storage: Option, - #[clap(long, default_value = "{}", value_parser = remote_storage_from_toml)] - parquet_upload_disconnect_events_remote_storage: OptRemoteStorageConfig, + #[clap(long, value_parser = remote_storage_from_toml)] + parquet_upload_disconnect_events_remote_storage: Option, /// How many rows to include in a row group #[clap(long, default_value_t = 8192)] diff --git a/proxy/src/http.rs b/proxy/src/http.rs index fc7400869ffa..dd7164181d94 100644 --- a/proxy/src/http.rs +++ b/proxy/src/http.rs @@ -4,14 +4,11 @@ pub mod health_server; -use std::{str::FromStr, sync::Arc, time::Duration}; +use std::time::Duration; -use futures::FutureExt; pub use reqwest::{Request, Response, StatusCode}; pub use reqwest_middleware::{ClientWithMiddleware, Error}; pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; -use tokio::time::Instant; -use tracing::trace; use crate::{ metrics::{ConsoleRequest, Metrics}, @@ -24,8 +21,6 @@ use reqwest_middleware::RequestBuilder; /// We deliberately don't want to replace this with a public static. 
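
// Illustrative sketch (not part of the patch) of the clap pattern adopted above: with a
// fallible `value_parser`, the field can be a plain `Option<T>` that stays `None` when the
// flag is omitted, which removes the need for the old `default_value = "{}"` / type-alias
// workaround. `StorageConfig` and `parse_storage` are hypothetical stand-ins for
// `RemoteStorageConfig` and `remote_storage_from_toml`.
use clap::Parser;

#[derive(Clone, Debug)]
struct StorageConfig { bucket: String }

fn parse_storage(s: &str) -> anyhow::Result<StorageConfig> {
    // A real parser would decode the inline TOML table here.
    Ok(StorageConfig { bucket: s.to_string() })
}

#[derive(Parser)]
struct SketchArgs {
    /// Optional remote storage configuration; omitting the flag simply means "no remote storage".
    #[clap(long, value_parser = parse_storage)]
    remote_storage: Option<StorageConfig>,
}
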
pub fn new_client() -> ClientWithMiddleware { let client = reqwest::ClientBuilder::new() - .dns_resolver(Arc::new(GaiResolver::default())) - .connection_verbose(true) .build() .expect("Failed to create http client"); @@ -36,8 +31,6 @@ pub fn new_client() -> ClientWithMiddleware { pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware { let timeout_client = reqwest::ClientBuilder::new() - .dns_resolver(Arc::new(GaiResolver::default())) - .connection_verbose(true) .timeout(default_timout) .build() .expect("Failed to create http client with timeout"); @@ -103,38 +96,6 @@ impl Endpoint { } } -use hyper_util::client::legacy::connect::dns::{ - GaiResolver as HyperGaiResolver, Name as HyperName, -}; -use reqwest::dns::{Addrs, Name, Resolve, Resolving}; -/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html -use tower_service::Service; -#[derive(Debug)] -pub struct GaiResolver(HyperGaiResolver); - -impl Default for GaiResolver { - fn default() -> Self { - Self(HyperGaiResolver::new()) - } -} - -impl Resolve for GaiResolver { - fn resolve(&self, name: Name) -> Resolving { - let this = &mut self.0.clone(); - let hyper_name = HyperName::from_str(name.as_str()).expect("name should be valid"); - let start = Instant::now(); - Box::pin( - Service::::call(this, hyper_name).map(move |result| { - let resolve_duration = start.elapsed(); - trace!(duration = ?resolve_duration, addr = %name.as_str(), "resolve host complete"); - result - .map(|addrs| -> Addrs { Box::new(addrs) }) - .map_err(|err| -> Box { Box::new(err) }) - }), - ) - } -} - #[cfg(test)] mod tests { use super::*; diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs index 3243e6a14010..d307d80f4af9 100644 --- a/proxy/src/jemalloc.rs +++ b/proxy/src/jemalloc.rs @@ -3,8 +3,8 @@ use std::marker::PhantomData; use measured::{ label::NoLabels, metric::{ - gauge::GaugeState, group::Encoding, group::MetricValue, name::MetricNameEncoder, - MetricEncoding, MetricFamilyEncoding, MetricType, + gauge::GaugeState, group::Encoding, name::MetricNameEncoder, MetricEncoding, + MetricFamilyEncoding, MetricType, }, text::TextEncoder, LabelGroup, MetricGroup, @@ -100,7 +100,7 @@ macro_rules! 
jemalloc_gauge { enc: &mut TextEncoder, ) -> Result<(), std::io::Error> { if let Ok(v) = mib.read() { - enc.write_metric_value(name, labels, MetricValue::Int(v as i64))?; + GaugeState::new(v as i64).collect_into(&(), labels, name, enc)?; } Ok(()) } diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 3405b8cbc672..3b30ad8b4663 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -15,7 +15,8 @@ use tracing_subscriber::{ pub async fn init() -> anyhow::Result { let env_filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) - .from_env_lossy(); + .from_env_lossy() + .add_directive("azure_core::policies::transport=off".parse().unwrap()); let fmt_layer = tracing_subscriber::fmt::layer() .with_ansi(false) diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index e2a75a872009..db25ac031115 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -2,7 +2,7 @@ use std::sync::{Arc, OnceLock}; use lasso::ThreadedRodeo; use measured::{ - label::{FixedCardinalitySet, LabelName, LabelSet, LabelValue, StaticLabelSet}, + label::{FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet}, metric::{histogram::Thresholds, name::MetricName}, Counter, CounterVec, FixedCardinalityLabel, Gauge, GaugeVec, Histogram, HistogramVec, LabelGroup, MetricGroup, @@ -577,6 +577,32 @@ impl LabelGroup for ThreadPoolWorkerId { } } +impl LabelGroupSet for ThreadPoolWorkers { + type Group<'a> = ThreadPoolWorkerId; + + fn cardinality(&self) -> Option { + Some(self.0) + } + + fn encode_dense(&self, value: Self::Unique) -> Option { + Some(value) + } + + fn decode_dense(&self, value: usize) -> Self::Group<'_> { + ThreadPoolWorkerId(value) + } + + type Unique = usize; + + fn encode(&self, value: Self::Group<'_>) -> Option { + Some(value.0) + } + + fn decode(&self, value: &Self::Unique) -> Self::Group<'_> { + ThreadPoolWorkerId(*value) + } +} + impl LabelSet for ThreadPoolWorkers { type Value<'a> = ThreadPoolWorkerId; diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index dd935cc24528..d488aea9275b 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -1,11 +1,17 @@ -use pq_proto::{BeMessage as Be, CancelKeyData, FeStartupPacket, StartupMessageParams}; +use bytes::Buf; +use pq_proto::{ + framed::Framed, BeMessage as Be, CancelKeyData, FeStartupPacket, ProtocolVersion, + StartupMessageParams, +}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::info; +use tracing::{info, warn}; use crate::{ - config::TlsConfig, + auth::endpoint_sni, + config::{TlsConfig, PG_ALPN_PROTOCOL}, error::ReportableError, + metrics::Metrics, proxy::ERR_INSECURE_CONNECTION, stream::{PqStream, Stream, StreamUpgradeError}, }; @@ -68,6 +74,9 @@ pub async fn handshake( // Client may try upgrading to each protocol only once let (mut tried_ssl, mut tried_gss) = (false, false); + const PG_PROTOCOL_EARLIEST: ProtocolVersion = ProtocolVersion::new(3, 0); + const PG_PROTOCOL_LATEST: ProtocolVersion = ProtocolVersion::new(3, 0); + let mut stream = PqStream::new(Stream::from_raw(stream)); loop { let msg = stream.read_startup_packet().await?; @@ -75,40 +84,96 @@ pub async fn handshake( use FeStartupPacket::*; match msg { - SslRequest => match stream.get_ref() { + SslRequest { direct } => match stream.get_ref() { Stream::Raw { .. 
} if !tried_ssl => { tried_ssl = true; // We can't perform TLS handshake without a config - let enc = tls.is_some(); - stream.write_message(&Be::EncryptionResponse(enc)).await?; + let have_tls = tls.is_some(); + if !direct { + stream + .write_message(&Be::EncryptionResponse(have_tls)) + .await?; + } else if !have_tls { + return Err(HandshakeError::ProtocolViolation); + } + if let Some(tls) = tls.take() { // Upgrade raw stream into a secure TLS-backed stream. // NOTE: We've consumed `tls`; this fact will be used later. - let (raw, read_buf) = stream.into_inner(); - // TODO: Normally, client doesn't send any data before - // server says TLS handshake is ok and read_buf is empy. - // However, you could imagine pipelining of postgres - // SSLRequest + TLS ClientHello in one hunk similar to - // pipelining in our node js driver. We should probably - // support that by chaining read_buf with the stream. + let Framed { + stream: raw, + read_buf, + write_buf, + } = stream.framed; + + let Stream::Raw { raw } = raw else { + return Err(HandshakeError::StreamUpgradeError( + StreamUpgradeError::AlreadyTls, + )); + }; + + let mut read_buf = read_buf.reader(); + let mut res = Ok(()); + let accept = tokio_rustls::TlsAcceptor::from(tls.to_server_config()) + .accept_with(raw, |session| { + // push the early data to the tls session + while !read_buf.get_ref().is_empty() { + match session.read_tls(&mut read_buf) { + Ok(_) => {} + Err(e) => { + res = Err(e); + break; + } + } + } + }); + + res?; + + let read_buf = read_buf.into_inner(); if !read_buf.is_empty() { return Err(HandshakeError::EarlyData); } - let tls_stream = raw - .upgrade(tls.to_server_config(), record_handshake_error) - .await?; + + let tls_stream = accept.await.inspect_err(|_| { + if record_handshake_error { + Metrics::get().proxy.tls_handshake_failures.inc() + } + })?; + + let conn_info = tls_stream.get_ref().1; + + // check the ALPN, if exists, as required. + match conn_info.alpn_protocol() { + None | Some(PG_ALPN_PROTOCOL) => {} + Some(other) => { + // try parse ep for better error + let ep = conn_info.server_name().and_then(|sni| { + endpoint_sni(sni, &tls.common_names).ok().flatten() + }); + let alpn = String::from_utf8_lossy(other); + warn!(?ep, %alpn, "unexpected ALPN"); + return Err(HandshakeError::ProtocolViolation); + } + } let (_, tls_server_end_point) = tls .cert_resolver - .resolve(tls_stream.get_ref().1.server_name()) + .resolve(conn_info.server_name()) .ok_or(HandshakeError::MissingCertificate)?; - stream = PqStream::new(Stream::Tls { - tls: Box::new(tls_stream), - tls_server_end_point, - }); + stream = PqStream { + framed: Framed { + stream: Stream::Tls { + tls: Box::new(tls_stream), + tls_server_end_point, + }, + read_buf, + write_buf, + }, + }; } } _ => return Err(HandshakeError::ProtocolViolation), @@ -122,7 +187,9 @@ pub async fn handshake( } _ => return Err(HandshakeError::ProtocolViolation), }, - StartupMessage { params, .. } => { + StartupMessage { params, version } + if PG_PROTOCOL_EARLIEST <= version && version <= PG_PROTOCOL_LATEST => + { // Check that the config has been consumed during upgrade // OR we didn't provide it at all (for dev purposes). 
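
// Illustrative sketch (not part of the patch) of the ALPN rule enforced above: the handshake
// accepts a TLS connection only when the client negotiated no ALPN at all (older drivers) or
// negotiated "postgresql"; anything else is treated as a protocol violation. Standalone
// helper with a hypothetical name; `PG_ALPN` mirrors `PG_ALPN_PROTOCOL` from config.rs.
const PG_ALPN: &[u8] = b"postgresql";

fn alpn_is_acceptable(negotiated: Option<&[u8]>) -> bool {
    match negotiated {
        None => true,                    // client did not use ALPN at all
        Some(proto) => proto == PG_ALPN, // must be the "postgresql" protocol
    }
}
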
if tls.is_some() { @@ -131,9 +198,48 @@ pub async fn handshake( .await?; } - info!(session_type = "normal", "successful handshake"); + info!(?version, session_type = "normal", "successful handshake"); break Ok(HandshakeData::Startup(stream, params)); } + // downgrade protocol version + StartupMessage { params, version } + if version.major() == 3 && version > PG_PROTOCOL_LATEST => + { + warn!(?version, "unsupported minor version"); + + // no protocol extensions are supported. + // + let mut unsupported = vec![]; + for (k, _) in params.iter() { + if k.starts_with("_pq_.") { + unsupported.push(k); + } + } + + // TODO: remove unsupported options so we don't send them to compute. + + stream + .write_message(&Be::NegotiateProtocolVersion { + version: PG_PROTOCOL_LATEST, + options: &unsupported, + }) + .await?; + + info!( + ?version, + session_type = "normal", + "successful handshake; unsupported minor version requested" + ); + break Ok(HandshakeData::Startup(stream, params)); + } + StartupMessage { version, .. } => { + warn!( + ?version, + session_type = "normal", + "unsuccessful handshake; unsupported version" + ); + return Err(HandshakeError::ProtocolViolation); + } CancelRequest(cancel_key_data) => { info!(session_type = "cancellation", "successful handshake"); break Ok(HandshakeData::Cancel(cancel_key_data)); diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 8119f39fae6b..5186a9e1b0f1 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -540,8 +540,8 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn }, allow_self_signed_compute: false, }; - let (_, node) = cache.insert("key".into(), node); - node + let (_, node2) = cache.insert_unit("key".into(), Ok(node.clone())); + node2.map(|()| node) } fn helper_create_connect_info( diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 583ff75f7ca7..8118ae5ea89d 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -838,8 +838,9 @@ async fn query_to_json( "finished reading rows" ); - let mut fields = vec![]; - let mut columns = vec![]; + let columns_len = row_stream.columns().len(); + let mut fields = Vec::with_capacity(columns_len); + let mut columns = Vec::with_capacity(columns_len); for c in row_stream.columns() { fields.push(json!({ diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 20650490b1ae..9eb6546d6bae 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -12,7 +12,6 @@ use sd_notify::NotifyState; use tokio::runtime::Handle; use tokio::signal::unix::{signal, SignalKind}; use tokio::task::JoinError; -use toml_edit::Document; use utils::logging::SecretString; use std::env::{var, VarError}; @@ -28,8 +27,9 @@ use utils::pid_file; use metrics::set_build_info_metric; use safekeeper::defaults::{ - DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR, - DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, + DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_HEARTBEAT_TIMEOUT, + DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES, DEFAULT_PARTIAL_BACKUP_CONCURRENCY, + DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR, }; use safekeeper::http; use safekeeper::wal_service; @@ -125,7 +125,7 @@ struct Args { peer_recovery: bool, /// Remote storage configuration for WAL backup (offloading to s3) as TOML /// inline table, e.g. 
- /// {"max_concurrent_syncs" = 17, "max_sync_errors": 13, "bucket_name": "", "bucket_region":"", "concurrency_limit": 119} + /// {max_concurrent_syncs = 17, max_sync_errors = 13, bucket_name = "", bucket_region = "", concurrency_limit = 119} /// Safekeeper offloads WAL to /// [prefix_in_bucket/]//, mirroring /// structure on the file system. @@ -191,6 +191,15 @@ struct Args { /// Pending updates to control file will be automatically saved after this interval. #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_CONTROL_FILE_SAVE_INTERVAL)] control_file_save_interval: Duration, + /// Number of allowed concurrent uploads of partial segments to remote storage. + #[arg(long, default_value = DEFAULT_PARTIAL_BACKUP_CONCURRENCY)] + partial_backup_concurrency: usize, + /// How long a timeline must be resident before it is eligible for eviction. + /// Usually, timeline eviction has to wait for `partial_backup_timeout` before being eligible for eviction, + /// but if a timeline is un-evicted and then _not_ written to, it would immediately flap to evicting again, + /// if it weren't for `eviction_min_resident` preventing that. + #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)] + eviction_min_resident: Duration, } // Like PathBufValueParser, but allows empty string. @@ -344,6 +353,8 @@ async fn main() -> anyhow::Result<()> { enable_offload: args.enable_offload, delete_offloaded_wal: args.delete_offloaded_wal, control_file_save_interval: args.control_file_save_interval, + partial_backup_concurrency: args.partial_backup_concurrency, + eviction_min_resident: args.eviction_min_resident, }; // initialize sentry if SENTRY_DSN is provided @@ -441,6 +452,19 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { .map(|res| ("WAL service main".to_owned(), res)); tasks_handles.push(Box::pin(wal_service_handle)); + let timeline_housekeeping_handle = current_thread_rt + .as_ref() + .unwrap_or_else(|| WAL_SERVICE_RUNTIME.handle()) + .spawn(async move { + const TOMBSTONE_TTL: Duration = Duration::from_secs(3600 * 24); + loop { + tokio::time::sleep(TOMBSTONE_TTL).await; + GlobalTimelines::housekeeping(&TOMBSTONE_TTL); + } + }) + .map(|res| ("Timeline map housekeeping".to_owned(), res)); + tasks_handles.push(Box::pin(timeline_housekeeping_handle)); + if let Some(pg_listener_tenant_only) = pg_listener_tenant_only { let conf_ = conf.clone(); let wal_service_handle = current_thread_rt @@ -548,16 +572,8 @@ fn set_id(workdir: &Utf8Path, given_id: Option) -> Result { Ok(my_id) } -// Parse RemoteStorage from TOML table. fn parse_remote_storage(storage_conf: &str) -> anyhow::Result { - // funny toml doesn't consider plain inline table as valid document, so wrap in a key to parse - let storage_conf_toml = format!("remote_storage = {storage_conf}"); - let parsed_toml = storage_conf_toml.parse::()?; // parse - let (_, storage_conf_parsed_toml) = parsed_toml.iter().next().unwrap(); // and strip key off again - RemoteStorageConfig::from_toml(storage_conf_parsed_toml).and_then(|parsed_config| { - // XXX: Don't print the original toml here, there might be some sensitive data - parsed_config.context("Incorrectly parsed remote storage toml as no remote storage config") - }) + RemoteStorageConfig::from_toml(&storage_conf.parse()?) 
} #[test] diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 067e425570e7..af83feb77fac 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -52,6 +52,12 @@ pub mod defaults { pub const DEFAULT_MAX_OFFLOADER_LAG_BYTES: u64 = 128 * (1 << 20); pub const DEFAULT_PARTIAL_BACKUP_TIMEOUT: &str = "15m"; pub const DEFAULT_CONTROL_FILE_SAVE_INTERVAL: &str = "300s"; + pub const DEFAULT_PARTIAL_BACKUP_CONCURRENCY: &str = "5"; + + // By default, our required residency before eviction is the same as the period that passes + // before uploading a partial segment, so that in normal operation the eviction can happen + // as soon as we have done the partial segment upload. + pub const DEFAULT_EVICTION_MIN_RESIDENT: &str = DEFAULT_PARTIAL_BACKUP_TIMEOUT; } #[derive(Debug, Clone)] @@ -91,6 +97,8 @@ pub struct SafeKeeperConf { pub enable_offload: bool, pub delete_offloaded_wal: bool, pub control_file_save_interval: Duration, + pub partial_backup_concurrency: usize, + pub eviction_min_resident: Duration, } impl SafeKeeperConf { @@ -133,6 +141,8 @@ impl SafeKeeperConf { enable_offload: false, delete_offloaded_wal: false, control_file_save_interval: Duration::from_secs(1), + partial_backup_concurrency: 1, + eviction_min_resident: Duration::ZERO, } } } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 1e965393e397..539ecf826bf8 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -5,15 +5,15 @@ use std::{ time::{Instant, SystemTime}, }; -use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_WRITE_SECONDS_BUCKETS}; +use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_FSYNC_SECONDS_BUCKETS}; use anyhow::Result; use futures::Future; use metrics::{ core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, proto::MetricFamily, - register_int_counter, register_int_counter_pair, register_int_counter_pair_vec, - register_int_counter_vec, Gauge, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, - IntGaugeVec, + register_histogram_vec, register_int_counter, register_int_counter_pair, + register_int_counter_pair_vec, register_int_counter_vec, Gauge, HistogramVec, IntCounter, + IntCounterPair, IntCounterPairVec, IntCounterVec, IntGaugeVec, }; use once_cell::sync::Lazy; @@ -48,7 +48,7 @@ pub static WRITE_WAL_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_write_wal_seconds", "Seconds spent writing and syncing WAL to a disk in a single request", - DISK_WRITE_SECONDS_BUCKETS.to_vec() + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_write_wal_seconds histogram") }); @@ -56,7 +56,7 @@ pub static FLUSH_WAL_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_flush_wal_seconds", "Seconds spent syncing WAL to a disk", - DISK_WRITE_SECONDS_BUCKETS.to_vec() + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_flush_wal_seconds histogram") }); @@ -64,10 +64,28 @@ pub static PERSIST_CONTROL_FILE_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_persist_control_file_seconds", "Seconds to persist and sync control file", - DISK_WRITE_SECONDS_BUCKETS.to_vec() + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_persist_control_file_seconds histogram vec") }); +pub static WAL_STORAGE_OPERATION_SECONDS: Lazy = Lazy::new(|| { + register_histogram_vec!( + "safekeeper_wal_storage_operation_seconds", + "Seconds spent on WAL storage operations", + &["operation"], + 
DISK_FSYNC_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_wal_storage_operation_seconds histogram vec") +}); +pub static MISC_OPERATION_SECONDS: Lazy = Lazy::new(|| { + register_histogram_vec!( + "safekeeper_misc_operation_seconds", + "Seconds spent on miscellaneous operations", + &["operation"], + DISK_FSYNC_SECONDS_BUCKETS.to_vec() + ) + .expect("Failed to register safekeeper_misc_operation_seconds histogram vec") +}); pub static PG_IO_BYTES: Lazy = Lazy::new(|| { register_int_counter_vec!( "safekeeper_pg_io_bytes_total", @@ -126,7 +144,7 @@ pub static BROKER_PUSH_ALL_UPDATES_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_broker_push_update_seconds", "Seconds to push all timeline updates to the broker", - DISK_WRITE_SECONDS_BUCKETS.to_vec() + DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_broker_push_update_seconds histogram vec") }); diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 4d0992e8bda9..33ec39b852f4 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -15,6 +15,7 @@ use storage_broker::proto::SafekeeperTimelineInfo; use tracing::*; use crate::control_file; +use crate::metrics::MISC_OPERATION_SECONDS; use crate::send_wal::HotStandbyFeedback; use crate::state::TimelineState; @@ -696,6 +697,10 @@ where &mut self, msg: &ProposerElected, ) -> Result> { + let _timer = MISC_OPERATION_SECONDS + .with_label_values(&["handle_elected"]) + .start_timer(); + info!("received ProposerElected {:?}", msg); if self.state.acceptor_state.term < msg.term { let mut state = self.state.start_change(); diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index e0f7b65aef84..dca64140827f 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -189,7 +189,12 @@ where /// Persist given state. c.f. start_change. 
pub async fn finish_change(&mut self, s: &TimelinePersistentState) -> Result<()> { - self.pers.persist(s).await?; + if s.eq(&*self.pers) { + // nothing to do if state didn't change + } else { + self.pers.persist(s).await?; + } + // keep in memory values up to date self.inmem.commit_lsn = s.commit_lsn; self.inmem.backup_lsn = s.backup_lsn; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index f632cd6fb3ec..132e5ec32f4f 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -36,10 +36,10 @@ use crate::timeline_guard::ResidenceGuard; use crate::timeline_manager::{AtomicStatus, ManagerCtl}; use crate::timelines_set::TimelinesSet; use crate::wal_backup::{self}; -use crate::wal_backup_partial::PartialRemoteSegment; +use crate::wal_backup_partial::{PartialRemoteSegment, RateLimiter}; use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; -use crate::metrics::{FullTimelineInfo, WalStorageMetrics}; +use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS}; use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; use crate::{debug_dump, timeline_manager, wal_storage}; use crate::{GlobalTimelines, SafeKeeperConf}; @@ -587,6 +587,7 @@ impl Timeline { shared_state: &mut WriteGuardSharedState<'_>, conf: &SafeKeeperConf, broker_active_set: Arc, + partial_backup_rate_limiter: RateLimiter, ) -> Result<()> { match fs::metadata(&self.timeline_dir).await { Ok(_) => { @@ -617,7 +618,7 @@ impl Timeline { return Err(e); } - self.bootstrap(conf, broker_active_set); + self.bootstrap(conf, broker_active_set, partial_backup_rate_limiter); Ok(()) } @@ -626,6 +627,7 @@ impl Timeline { self: &Arc, conf: &SafeKeeperConf, broker_active_set: Arc, + partial_backup_rate_limiter: RateLimiter, ) { let (tx, rx) = self.manager_ctl.bootstrap_manager(); @@ -637,6 +639,7 @@ impl Timeline { broker_active_set, tx, rx, + partial_backup_rate_limiter, )); } @@ -856,28 +859,40 @@ impl Timeline { } debug!("requesting WalResidentTimeline guard"); - - // Wait 5 seconds for the guard to be acquired, should be enough for uneviction. - // If it times out, most likely there is a deadlock in the manager task. - let res = tokio::time::timeout( - Duration::from_secs(5), + let started_at = Instant::now(); + let status_before = self.mgr_status.get(); + + // Wait 30 seconds for the guard to be acquired. It can time out if someone is + // holding the lock (e.g. during `SafeKeeper::process_msg()`) or manager task + // is stuck. 
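
// Illustrative sketch (not part of the patch) of the "skip no-op persists" change in
// `finish_change` above: compare the new state against what is already durable and only pay
// for a control-file write (and fsync) when something actually changed. Types are simplified
// stand-ins for TimelinePersistentState.
#[derive(PartialEq, Clone)]
struct SketchPersistentState { commit_lsn: u64, backup_lsn: u64 }

fn needs_persist(durable: &SketchPersistentState, new_state: &SketchPersistentState) -> bool {
    // The patch performs the equivalent `s.eq(&*self.pers)` check before calling `persist`.
    durable != new_state
}
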
+ let res = tokio::time::timeout_at( + started_at + Duration::from_secs(30), self.manager_ctl.wal_residence_guard(), ) .await; let guard = match res { - Ok(Ok(guard)) => guard, + Ok(Ok(guard)) => { + let finished_at = Instant::now(); + let elapsed = finished_at - started_at; + MISC_OPERATION_SECONDS + .with_label_values(&["wal_residence_guard"]) + .observe(elapsed.as_secs_f64()); + + guard + } Ok(Err(e)) => { warn!( - "error while acquiring WalResidentTimeline guard (current state {:?}): {}", - self.mgr_status.get(), - e + "error while acquiring WalResidentTimeline guard, statuses {:?} => {:?}", + status_before, + self.mgr_status.get() ); return Err(e); } Err(_) => { warn!( - "timeout while acquiring WalResidentTimeline guard (current state {:?})", + "timeout while acquiring WalResidentTimeline guard, statuses {:?} => {:?}", + status_before, self.mgr_status.get() ); anyhow::bail!("timeout while acquiring WalResidentTimeline guard"); diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index b303d41b7bab..e4ab65290d52 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -5,6 +5,7 @@ use anyhow::Context; use camino::Utf8PathBuf; use remote_storage::RemotePath; +use std::time::Instant; use tokio::{ fs::File, io::{AsyncRead, AsyncWriteExt}, @@ -48,6 +49,7 @@ impl Manager { .flush_lsn .segment_number(self.wal_seg_size) == self.last_removed_segno + 1 + && self.resident_since.elapsed() >= self.conf.eviction_min_resident } /// Evict the timeline to remote storage. @@ -91,6 +93,8 @@ impl Manager { return; } + self.resident_since = Instant::now(); + info!("successfully restored evicted timeline"); } } diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index c3abeac6449f..debf8c824f2d 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -22,7 +22,7 @@ use utils::lsn::Lsn; use crate::{ control_file::{FileStorage, Storage}, - metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL}, + metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS}, recovery::recovery_main, remove_wal::calc_horizon_lsn, safekeeper::Term, @@ -32,7 +32,7 @@ use crate::{ timeline_guard::{AccessService, GuardId, ResidenceGuard}, timelines_set::{TimelineSetGuard, TimelinesSet}, wal_backup::{self, WalBackupTaskHandle}, - wal_backup_partial::{self, PartialRemoteSegment}, + wal_backup_partial::{self, PartialRemoteSegment, RateLimiter}, SafeKeeperConf, }; @@ -185,6 +185,11 @@ pub(crate) struct Manager { // misc pub(crate) access_service: AccessService, + pub(crate) partial_backup_rate_limiter: RateLimiter, + + // Anti-flapping state: we evict timelines eagerly if they are inactive, but should not + // evict them if they go inactive very soon after being restored. + pub(crate) resident_since: std::time::Instant, } /// This task gets spawned alongside each timeline and is responsible for managing the timeline's @@ -197,6 +202,7 @@ pub async fn main_task( broker_active_set: Arc, manager_tx: tokio::sync::mpsc::UnboundedSender, mut manager_rx: tokio::sync::mpsc::UnboundedReceiver, + partial_backup_rate_limiter: RateLimiter, ) { tli.set_status(Status::Started); @@ -209,7 +215,14 @@ pub async fn main_task( } }; - let mut mgr = Manager::new(tli, conf, broker_active_set, manager_tx).await; + let mut mgr = Manager::new( + tli, + conf, + broker_active_set, + manager_tx, + partial_backup_rate_limiter, + ) + .await; // Start recovery task which always runs on the timeline. 
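
// Illustrative sketch (not part of the patch) of the anti-flapping rule added above: a
// timeline only becomes eligible for eviction once it has been resident for at least
// `eviction_min_resident`, so an un-evicted but idle timeline does not bounce straight back
// into eviction. Field and type names are stand-ins.
use std::time::{Duration, Instant};

struct ResidencyState {
    resident_since: Instant,
    eviction_min_resident: Duration,
}

impl ResidencyState {
    // `otherwise_ready` stands for the existing eviction conditions (offloading done, etc.).
    fn eviction_allowed(&self, otherwise_ready: bool) -> bool {
        otherwise_ready && self.resident_since.elapsed() >= self.eviction_min_resident
    }

    // Called after a successful restore (un-eviction) to restart the residency clock.
    fn on_restored(&mut self) {
        self.resident_since = Instant::now();
    }
}
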
if !mgr.is_offloaded && mgr.conf.peer_recovery_enabled { @@ -321,6 +334,7 @@ impl Manager { conf: SafeKeeperConf, broker_active_set: Arc, manager_tx: tokio::sync::mpsc::UnboundedSender, + partial_backup_rate_limiter: RateLimiter, ) -> Manager { let (is_offloaded, partial_backup_uploaded) = tli.bootstrap_mgr().await; Manager { @@ -339,6 +353,8 @@ impl Manager { partial_backup_uploaded, access_service: AccessService::new(manager_tx), tli, + partial_backup_rate_limiter, + resident_since: std::time::Instant::now(), } } @@ -357,6 +373,10 @@ impl Manager { /// Get a snapshot of the timeline state. async fn state_snapshot(&self) -> StateSnapshot { + let _timer = MISC_OPERATION_SECONDS + .with_label_values(&["state_snapshot"]) + .start_timer(); + StateSnapshot::new( self.tli.read_shared_state().await, self.conf.heartbeat_timeout, @@ -521,6 +541,7 @@ impl Manager { self.partial_backup_task = Some(tokio::spawn(wal_backup_partial::main_task( self.wal_resident_timeline(), self.conf.clone(), + self.partial_backup_rate_limiter.clone(), ))); } diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 45e08ede3c0a..f57da5c7cbf1 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -5,6 +5,7 @@ use crate::safekeeper::ServerInfo; use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}; use crate::timelines_set::TimelinesSet; +use crate::wal_backup_partial::RateLimiter; use crate::SafeKeeperConf; use anyhow::{bail, Context, Result}; use camino::Utf8PathBuf; @@ -14,15 +15,23 @@ use std::collections::HashMap; use std::str::FromStr; use std::sync::atomic::Ordering; use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; use tracing::*; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; struct GlobalTimelinesState { timelines: HashMap>, + + // A tombstone indicates this timeline used to exist has been deleted. These are used to prevent + // on-demand timeline creation from recreating deleted timelines. This is only soft-enforced, as + // this map is dropped on restart. + tombstones: HashMap, + conf: Option, broker_active_set: Arc, load_lock: Arc>, + partial_backup_rate_limiter: RateLimiter, } // Used to prevent concurrent timeline loading. @@ -37,8 +46,12 @@ impl GlobalTimelinesState { } /// Get dependencies for a timeline constructor. - fn get_dependencies(&self) -> (SafeKeeperConf, Arc) { - (self.get_conf().clone(), self.broker_active_set.clone()) + fn get_dependencies(&self) -> (SafeKeeperConf, Arc, RateLimiter) { + ( + self.get_conf().clone(), + self.broker_active_set.clone(), + self.partial_backup_rate_limiter.clone(), + ) } /// Insert timeline into the map. Returns error if timeline with the same id already exists. 
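
// Illustrative sketch (not part of the patch) of the tombstone mechanism introduced in
// timelines_global_map.rs: deletion leaves a timestamped marker so a late-arriving compute
// cannot silently recreate the timeline, and periodic housekeeping prunes markers older than
// a TTL. `SketchTimelineId` is a stand-in for TenantTimelineId.
use std::collections::HashMap;
use std::time::{Duration, Instant};

type SketchTimelineId = u64;

#[derive(Default)]
struct Tombstones {
    deleted_at: HashMap<SketchTimelineId, Instant>,
}

impl Tombstones {
    fn mark_deleted(&mut self, id: SketchTimelineId) {
        self.deleted_at.insert(id, Instant::now());
    }

    // Creation is refused while a tombstone for the id is still present.
    fn allows_creation(&self, id: &SketchTimelineId) -> bool {
        !self.deleted_at.contains_key(id)
    }

    // Called periodically (once a day in the patch) to drop stale markers.
    fn housekeeping(&mut self, ttl: Duration) {
        let now = Instant::now();
        self.deleted_at.retain(|_, t| now.duration_since(*t) < ttl);
    }
}
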
@@ -58,14 +71,21 @@ impl GlobalTimelinesState { .cloned() .ok_or(TimelineError::NotFound(*ttid)) } + + fn delete(&mut self, ttid: TenantTimelineId) { + self.timelines.remove(&ttid); + self.tombstones.insert(ttid, Instant::now()); + } } static TIMELINES_STATE: Lazy> = Lazy::new(|| { Mutex::new(GlobalTimelinesState { timelines: HashMap::new(), + tombstones: HashMap::new(), conf: None, broker_active_set: Arc::new(TimelinesSet::default()), load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)), + partial_backup_rate_limiter: RateLimiter::new(1), }) }); @@ -79,6 +99,7 @@ impl GlobalTimelines { // lock, so use explicit block let tenants_dir = { let mut state = TIMELINES_STATE.lock().unwrap(); + state.partial_backup_rate_limiter = RateLimiter::new(conf.partial_backup_concurrency); state.conf = Some(conf); // Iterate through all directories and load tenants for all directories @@ -122,7 +143,7 @@ impl GlobalTimelines { /// this function is called during init when nothing else is running, so /// this is fine. async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> { - let (conf, broker_active_set) = { + let (conf, broker_active_set, partial_backup_rate_limiter) = { let state = TIMELINES_STATE.lock().unwrap(); state.get_dependencies() }; @@ -145,7 +166,11 @@ impl GlobalTimelines { .unwrap() .timelines .insert(ttid, tli.clone()); - tli.bootstrap(&conf, broker_active_set.clone()); + tli.bootstrap( + &conf, + broker_active_set.clone(), + partial_backup_rate_limiter.clone(), + ); } // If we can't load a timeline, it's most likely because of a corrupted // directory. We will log an error and won't allow to delete/recreate @@ -178,20 +203,27 @@ impl GlobalTimelines { _guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>, ttid: TenantTimelineId, ) -> Result> { - let (conf, broker_active_set) = TIMELINES_STATE.lock().unwrap().get_dependencies(); + let (conf, broker_active_set, partial_backup_rate_limiter) = + TIMELINES_STATE.lock().unwrap().get_dependencies(); match Timeline::load_timeline(&conf, ttid) { Ok(timeline) => { let tli = Arc::new(timeline); // TODO: prevent concurrent timeline creation/loading - TIMELINES_STATE - .lock() - .unwrap() - .timelines - .insert(ttid, tli.clone()); + { + let mut state = TIMELINES_STATE.lock().unwrap(); + + // We may be have been asked to load a timeline that was previously deleted (e.g. from `pull_timeline.rs`). We trust + // that the human doing this manual intervention knows what they are doing, and remove its tombstone. + if state.tombstones.remove(&ttid).is_some() { + warn!("Un-deleted timeline {ttid}"); + } + + state.timelines.insert(ttid, tli.clone()); + } - tli.bootstrap(&conf, broker_active_set); + tli.bootstrap(&conf, broker_active_set, partial_backup_rate_limiter); Ok(tli) } @@ -216,18 +248,23 @@ impl GlobalTimelines { /// Create a new timeline with the given id. If the timeline already exists, returns /// an existing timeline. - pub async fn create( + pub(crate) async fn create( ttid: TenantTimelineId, server_info: ServerInfo, commit_lsn: Lsn, local_start_lsn: Lsn, ) -> Result> { - let (conf, broker_active_set) = { + let (conf, broker_active_set, partial_backup_rate_limiter) = { let state = TIMELINES_STATE.lock().unwrap(); if let Ok(timeline) = state.get(&ttid) { // Timeline already exists, return it. 
return Ok(timeline); } + + if state.tombstones.contains_key(&ttid) { + anyhow::bail!("Timeline {ttid} is deleted, refusing to recreate"); + } + state.get_dependencies() }; @@ -257,7 +294,12 @@ impl GlobalTimelines { // Bootstrap is transactional, so if it fails, the timeline will be deleted, // and the state on disk should remain unchanged. if let Err(e) = timeline - .init_new(&mut shared_state, &conf, broker_active_set) + .init_new( + &mut shared_state, + &conf, + broker_active_set, + partial_backup_rate_limiter, + ) .await { // Note: the most likely reason for init failure is that the timeline @@ -282,17 +324,19 @@ impl GlobalTimelines { /// Get a timeline from the global map. If it's not present, it doesn't exist on disk, /// or was corrupted and couldn't be loaded on startup. Returned timeline is always valid, /// i.e. loaded in memory and not cancelled. - pub fn get(ttid: TenantTimelineId) -> Result, TimelineError> { - let res = TIMELINES_STATE.lock().unwrap().get(&ttid); - - match res { + pub(crate) fn get(ttid: TenantTimelineId) -> Result, TimelineError> { + let tli_res = { + let state = TIMELINES_STATE.lock().unwrap(); + state.get(&ttid) + }; + match tli_res { Ok(tli) => { if tli.is_cancelled() { return Err(TimelineError::Cancelled(ttid)); } Ok(tli) } - _ => res, + _ => tli_res, } } @@ -321,12 +365,26 @@ impl GlobalTimelines { /// Cancels timeline, then deletes the corresponding data directory. /// If only_local, doesn't remove WAL segments in remote storage. - pub async fn delete( + pub(crate) async fn delete( ttid: &TenantTimelineId, only_local: bool, ) -> Result { - let tli_res = TIMELINES_STATE.lock().unwrap().get(ttid); - match tli_res { + let tli_res = { + let state = TIMELINES_STATE.lock().unwrap(); + + if state.tombstones.contains_key(ttid) { + // Presence of a tombstone guarantees that a previous deletion has completed and there is no work to do. + info!("Timeline {ttid} was already deleted"); + return Ok(TimelineDeleteForceResult { + dir_existed: false, + was_active: false, + }); + } + + state.get(ttid) + }; + + let result = match tli_res { Ok(timeline) => { let was_active = timeline.broker_active.load(Ordering::Relaxed); @@ -336,11 +394,6 @@ impl GlobalTimelines { info!("deleting timeline {}, only_local={}", ttid, only_local); let dir_existed = timeline.delete(&mut shared_state, only_local).await?; - // Remove timeline from the map. - // FIXME: re-enable it once we fix the issue with recreation of deleted timelines - // https://github.com/neondatabase/neon/issues/3146 - // TIMELINES_STATE.lock().unwrap().timelines.remove(ttid); - Ok(TimelineDeleteForceResult { dir_existed, was_active, // TODO: we probably should remove this field @@ -356,7 +409,14 @@ impl GlobalTimelines { was_active: false, }) } - } + }; + + // Finalize deletion, by dropping Timeline objects and storing smaller tombstones. The tombstones + // are used to prevent still-running computes from re-creating the same timeline when they send data, + // and to speed up repeated deletion calls by avoiding re-listing objects. + TIMELINES_STATE.lock().unwrap().delete(*ttid); + + result } /// Deactivates and deletes all timelines for the tenant. 
Returns map of all timelines which @@ -402,19 +462,20 @@ impl GlobalTimelines { tenant_id, ))?; - // FIXME: we temporarily disabled removing timelines from the map, see `delete_force` - // let tlis_after_delete = Self::get_all_for_tenant(*tenant_id); - // if !tlis_after_delete.is_empty() { - // // Some timelines were created while we were deleting them, returning error - // // to the caller, so it can retry later. - // bail!( - // "failed to delete all timelines for tenant {}: some timelines were created while we were deleting them", - // tenant_id - // ); - // } - Ok(deleted) } + + pub fn housekeeping(tombstone_ttl: &Duration) { + let mut state = TIMELINES_STATE.lock().unwrap(); + + // We keep tombstones long enough to have a good chance of preventing rogue computes from re-creating deleted + // timelines. If a compute kept running for longer than this TTL (or across a safekeeper restart) then they + // may recreate a deleted timeline. + let now = Instant::now(); + state + .tombstones + .retain(|_, v| now.duration_since(*v) < *tombstone_ttl); + } } #[derive(Clone, Copy, Serialize)] diff --git a/safekeeper/src/wal_backup_partial.rs b/safekeeper/src/wal_backup_partial.rs index 9c7cd0888d83..825851c97c9a 100644 --- a/safekeeper/src/wal_backup_partial.rs +++ b/safekeeper/src/wal_backup_partial.rs @@ -18,6 +18,8 @@ //! This way control file stores information about all potentially existing //! remote partial segments and can clean them up after uploading a newer version. +use std::sync::Arc; + use camino::Utf8PathBuf; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; use remote_storage::RemotePath; @@ -27,7 +29,7 @@ use tracing::{debug, error, info, instrument, warn}; use utils::lsn::Lsn; use crate::{ - metrics::{PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, + metrics::{MISC_OPERATION_SECONDS, PARTIAL_BACKUP_UPLOADED_BYTES, PARTIAL_BACKUP_UPLOADS}, safekeeper::Term, timeline::WalResidentTimeline, timeline_manager::StateSnapshot, @@ -35,6 +37,30 @@ use crate::{ SafeKeeperConf, }; +#[derive(Clone)] +pub struct RateLimiter { + semaphore: Arc, +} + +impl RateLimiter { + pub fn new(permits: usize) -> Self { + Self { + semaphore: Arc::new(tokio::sync::Semaphore::new(permits)), + } + } + + async fn acquire_owned(&self) -> tokio::sync::OwnedSemaphorePermit { + let _timer = MISC_OPERATION_SECONDS + .with_label_values(&["partial_permit_acquire"]) + .start_timer(); + self.semaphore + .clone() + .acquire_owned() + .await + .expect("semaphore is closed") + } +} + #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] pub enum UploadStatus { /// Upload is in progress. This status should be used only for garbage collection, @@ -208,6 +234,9 @@ impl PartialBackup { /// Upload the latest version of the partial segment and garbage collect older versions. 
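
// Illustrative usage sketch (not part of the patch) of the semaphore-backed limiter added in
// wal_backup_partial.rs: each timeline task clones one shared limiter and holds a permit for
// the duration of an upload, capping concurrent partial-segment uploads across timelines.
// `upload_partial_segment` is a hypothetical placeholder.
use std::sync::Arc;
use tokio::sync::Semaphore;

async fn upload_with_limit(sem: Arc<Semaphore>) -> anyhow::Result<()> {
    // The permit is released when `_permit` is dropped at the end of the scope.
    let _permit = sem.acquire_owned().await?;
    upload_partial_segment().await
}

async fn upload_partial_segment() -> anyhow::Result<()> {
    Ok(()) // placeholder for the real upload
}
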
#[instrument(name = "upload", skip_all, fields(name = %prepared.name))] async fn do_upload(&mut self, prepared: &PartialRemoteSegment) -> anyhow::Result<()> { + let _timer = MISC_OPERATION_SECONDS + .with_label_values(&["partial_do_upload"]) + .start_timer(); info!("starting upload {:?}", prepared); let state_0 = self.state.clone(); @@ -307,6 +336,7 @@ pub(crate) fn needs_uploading( pub async fn main_task( tli: WalResidentTimeline, conf: SafeKeeperConf, + limiter: RateLimiter, ) -> Option { debug!("started"); let await_duration = conf.partial_backup_timeout; @@ -411,6 +441,9 @@ pub async fn main_task( continue 'outer; } + // limit concurrent uploads + let _upload_permit = limiter.acquire_owned().await; + let prepared = backup.prepare_upload().await; if let Some(seg) = &uploaded_segment { if seg.eq_without_status(&prepared) { diff --git a/safekeeper/src/wal_service.rs b/safekeeper/src/wal_service.rs index 4a97eb3993f3..091571111e5c 100644 --- a/safekeeper/src/wal_service.rs +++ b/safekeeper/src/wal_service.rs @@ -4,9 +4,10 @@ //! use anyhow::{Context, Result}; use postgres_backend::QueryError; -use std::{future, time::Duration}; +use std::time::Duration; use tokio::net::TcpStream; use tokio_io_timeout::TimeoutReader; +use tokio_util::sync::CancellationToken; use tracing::*; use utils::{auth::Scope, measured_stream::MeasuredStream}; @@ -100,7 +101,7 @@ async fn handle_socket( // libpq protocol between safekeeper and walproposer / pageserver // We don't use shutdown. pgbackend - .run(&mut conn_handler, future::pending::<()>) + .run(&mut conn_handler, &CancellationToken::new()) .await } diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 74c4693ccd9b..ded8571a3e27 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -23,7 +23,9 @@ use tokio::io::{AsyncReadExt, AsyncSeekExt}; use tracing::*; use utils::crashsafe::durable_rename; -use crate::metrics::{time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS}; +use crate::metrics::{ + time_io_closure, WalStorageMetrics, REMOVED_WAL_SEGMENTS, WAL_STORAGE_OPERATION_SECONDS, +}; use crate::state::TimelinePersistentState; use crate::wal_backup::{read_object, remote_timeline_path}; use crate::SafeKeeperConf; @@ -331,6 +333,10 @@ impl Storage for PhysicalStorage { } async fn initialize_first_segment(&mut self, init_lsn: Lsn) -> Result<()> { + let _timer = WAL_STORAGE_OPERATION_SECONDS + .with_label_values(&["initialize_first_segment"]) + .start_timer(); + let segno = init_lsn.segment_number(self.wal_seg_size); let (mut file, _) = self.open_or_create(segno).await?; let major_pg_version = self.pg_version / 10000; @@ -422,6 +428,10 @@ impl Storage for PhysicalStorage { /// Truncate written WAL by removing all WAL segments after the given LSN. /// end_pos must point to the end of the WAL record. 
async fn truncate_wal(&mut self, end_pos: Lsn) -> Result<()> { + let _timer = WAL_STORAGE_OPERATION_SECONDS + .with_label_values(&["truncate_wal"]) + .start_timer(); + // Streaming must not create a hole, so truncate cannot be called on non-written lsn if self.write_lsn != Lsn(0) && end_pos > self.write_lsn { bail!( @@ -497,6 +507,10 @@ async fn remove_segments_from_disk( wal_seg_size: usize, remove_predicate: impl Fn(XLogSegNo) -> bool, ) -> Result<()> { + let _timer = WAL_STORAGE_OPERATION_SECONDS + .with_label_values(&["remove_segments_from_disk"]) + .start_timer(); + let mut n_removed = 0; let mut min_removed = u64::MAX; let mut max_removed = u64::MIN; diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 43835c7f4411..0c6d97ddfaad 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -187,6 +187,8 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { enable_offload: false, delete_offloaded_wal: false, control_file_save_interval: Duration::from_secs(1), + partial_backup_concurrency: 1, + eviction_min_resident: Duration::ZERO, }; let mut global = GlobalMap::new(disk, conf.clone())?; diff --git a/storage_controller/src/background_node_operations.rs b/storage_controller/src/background_node_operations.rs index 74b7e7c84955..6f1355eb6848 100644 --- a/storage_controller/src/background_node_operations.rs +++ b/storage_controller/src/background_node_operations.rs @@ -3,7 +3,7 @@ use std::{borrow::Cow, fmt::Debug, fmt::Display}; use tokio_util::sync::CancellationToken; use utils::id::NodeId; -pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 10; +pub(crate) const MAX_RECONCILES_PER_OPERATION: usize = 32; #[derive(Copy, Clone)] pub(crate) struct Drain { diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index 4d0f8006aaa4..c46539485c1f 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -323,7 +323,7 @@ impl ComputeHook { if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running { tracing::info!("Reconfiguring endpoint {}", endpoint_name,); endpoint - .reconfigure(compute_pageservers.clone(), *stripe_size) + .reconfigure(compute_pageservers.clone(), *stripe_size, None) .await .map_err(NotifyError::NeonLocal)?; } diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 680e6f09c4a0..7446ad53a231 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -10,8 +10,9 @@ use hyper::header::CONTENT_TYPE; use hyper::{Body, Request, Response}; use hyper::{StatusCode, Uri}; use metrics::{BuildInfo, NeonMetrics}; +use pageserver_api::controller_api::TenantCreateRequest; use pageserver_api::models::{ - TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest, + TenantConfigRequest, TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest, TimelineCreateRequest, }; use pageserver_api::shard::TenantShardId; diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs index 4d17dff9feaf..fff44aaf2670 100644 --- a/storage_controller/src/node.rs +++ b/storage_controller/src/node.rs @@ -226,7 +226,7 @@ impl Node { fn is_fatal(e: &mgmt_api::Error) -> bool { use mgmt_api::Error::*; match e { - ReceiveBody(_) | ReceiveErrorBody(_) => false, + SendRequest(_) | ReceiveBody(_) | ReceiveErrorBody(_) => false, 
ApiError(StatusCode::SERVICE_UNAVAILABLE, _) | ApiError(StatusCode::GATEWAY_TIMEOUT, _) | ApiError(StatusCode::REQUEST_TIMEOUT, _) => false, diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index fe97f724c132..886ceae90fbf 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -1,6 +1,7 @@ use crate::pageserver_client::PageserverClient; use crate::persistence::Persistence; use crate::service; +use pageserver_api::controller_api::PlacementPolicy; use pageserver_api::models::{ LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig, }; @@ -29,6 +30,7 @@ pub(super) struct Reconciler { /// of a tenant's state from when we spawned a reconcile task. pub(super) tenant_shard_id: TenantShardId, pub(crate) shard: ShardIdentity, + pub(crate) placement_policy: PlacementPolicy, pub(crate) generation: Option, pub(crate) intent: TargetState, @@ -641,7 +643,7 @@ impl Reconciler { generation, &self.shard, &self.config, - !self.intent.secondary.is_empty(), + &self.placement_policy, ); match self.observed.locations.get(&node.get_id()) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => { @@ -801,8 +803,15 @@ pub(crate) fn attached_location_conf( generation: Generation, shard: &ShardIdentity, config: &TenantConfig, - has_secondaries: bool, + policy: &PlacementPolicy, ) -> LocationConfig { + let has_secondaries = match policy { + PlacementPolicy::Attached(0) | PlacementPolicy::Detached | PlacementPolicy::Secondary => { + false + } + PlacementPolicy::Attached(_) => true, + }; + LocationConfig { mode: LocationConfigMode::AttachedSingle, generation: generation.into(), diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index e329f42dd610..aada1939eeea 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -32,10 +32,10 @@ use itertools::Itertools; use pageserver_api::{ controller_api::{ NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy, - ShardSchedulingPolicy, TenantCreateResponse, TenantCreateResponseShard, - TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, - TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, - UtilizationScore, + ShardSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, + TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, + TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, + TenantShardMigrateResponse, UtilizationScore, }, models::{SecondaryProgress, TenantConfigRequest, TopTenantShardsRequest}, }; @@ -46,10 +46,9 @@ use crate::pageserver_client::PageserverClient; use pageserver_api::{ models::{ self, LocationConfig, LocationConfigListResponse, LocationConfigMode, - PageserverUtilization, ShardParameters, TenantConfig, TenantCreateRequest, - TenantLocationConfigRequest, TenantLocationConfigResponse, TenantShardLocation, - TenantShardSplitRequest, TenantShardSplitResponse, TenantTimeTravelRequest, - TimelineCreateRequest, TimelineInfo, + PageserverUtilization, ShardParameters, TenantConfig, TenantLocationConfigRequest, + TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest, + TenantShardSplitResponse, TenantTimeTravelRequest, TimelineCreateRequest, TimelineInfo, }, shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize, TenantShardId}, upcall_api::{ @@ -152,6 +151,10 @@ struct ServiceState { /// controller API. 
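The widened `is_fatal` classification above now treats `SendRequest` failures (connection-level errors before any response is received) as retryable, alongside body-receive errors and the existing 503/504/408 statuses. Purely as an illustration of how a caller can use such a classifier, here is a generic retry wrapper; it is a sketch, not the storage controller's actual retry code, and backoff is omitted for brevity:

    use std::future::Future;

    /// Hypothetical helper: retry `attempt` until it succeeds, a fatal error is
    /// returned, or `max_attempts` is exhausted.
    async fn call_with_retries<T, E, Fut>(
        mut attempt: impl FnMut() -> Fut,
        is_fatal: impl Fn(&E) -> bool,
        max_attempts: usize,
    ) -> Result<T, E>
    where
        Fut: Future<Output = Result<T, E>>,
    {
        let mut last_err = None;
        for _ in 0..max_attempts {
            match attempt().await {
                Ok(v) => return Ok(v),
                // Fatal errors (e.g. auth failures) are surfaced immediately.
                Err(e) if is_fatal(&e) => return Err(e),
                // Transient errors (connect/send/receive, 503, ...) are retried.
                Err(e) => last_err = Some(e),
            }
        }
        Err(last_err.expect("max_attempts must be at least 1"))
    }
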
fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { match e { + mgmt_api::Error::SendRequest(e) => { + // Presume errors sending requests are connectivity/availability issues + ApiError::ResourceUnavailable(format!("{node} error sending request: {e}").into()) + } mgmt_api::Error::ReceiveErrorBody(str) => { // Presume errors receiving body are connectivity/availability issues ApiError::ResourceUnavailable( @@ -1391,7 +1394,7 @@ impl Service { tenant_shard.generation.unwrap(), &tenant_shard.shard, &tenant_shard.config, - false, + &PlacementPolicy::Attached(0), )), }, )]); @@ -3322,7 +3325,7 @@ impl Service { generation, &child_shard, &config, - matches!(policy, PlacementPolicy::Attached(n) if n > 0), + &policy, )), }, ); @@ -4063,7 +4066,14 @@ impl Service { placement_policy: Some(PlacementPolicy::Attached(0)), // No secondaries, for convenient debug/hacking // There is no way to know what the tenant's config was: revert to defaults - config: TenantConfig::default(), + // + // TODO: remove `switch_aux_file_policy` once we finish auxv2 migration + // + // we write to both v1+v2 storage, so that the test case can use either storage format for testing + config: TenantConfig { + switch_aux_file_policy: Some(models::AuxFilePolicy::CrossValidation), + ..TenantConfig::default() + }, }) .await?; @@ -5564,9 +5574,12 @@ impl Service { break; } - let mut can_take = attached - expected_attached; + let can_take = attached - expected_attached; + let needed = fill_requirement - plan.len(); + let mut take = std::cmp::min(can_take, needed); + let mut remove_node = false; - while can_take > 0 { + while take > 0 { match tids_by_node.get_mut(&node_id) { Some(tids) => match tids.pop() { Some(tid) => { @@ -5578,7 +5591,7 @@ impl Service { if *promoted < max_promote_for_tenant { plan.push(tid); *promoted += 1; - can_take -= 1; + take -= 1; } } None => { diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 45295bc59be8..3fcf31ac1028 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -908,12 +908,8 @@ impl TenantShard { .generation .expect("Attempted to enter attached state without a generation"); - let wanted_conf = attached_location_conf( - generation, - &self.shard, - &self.config, - !self.intent.secondary.is_empty(), - ); + let wanted_conf = + attached_location_conf(generation, &self.shard, &self.config, &self.policy); match self.observed.locations.get(&node_id) { Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {} Some(_) | None => { @@ -1099,6 +1095,7 @@ impl TenantShard { let mut reconciler = Reconciler { tenant_shard_id: self.tenant_shard_id, shard: self.shard, + placement_policy: self.policy.clone(), generation: self.generation, intent: reconciler_intent, detach, diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 4eb8580e32cf..f687b24320ce 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -259,7 +259,7 @@ pub(crate) enum BlobDataParseResult { Incorrect(Vec), } -fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { +pub(crate) fn parse_layer_object_name(name: &str) -> Result<(LayerName, Generation), String> { match name.rsplit_once('-') { // FIXME: this is gross, just use a regex? 
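One behavioural fix in the `service.rs` hunk above: when planning a fill, the number of shards taken from a donor node is now clamped not only by how many that node can give up, but also by how many the node being filled still needs. A small illustrative helper capturing that arithmetic (the names are made up; the real code updates `take` inside the planning loop):

    /// Illustrative only: how many shard promotions to plan from one donor node.
    fn promotions_to_take(
        attached: usize,          // shards currently attached to the donor node
        expected_attached: usize, // how many it should keep after rebalancing
        fill_requirement: usize,  // how many the node being filled wants in total
        already_planned: usize,   // promotions already added to the plan
    ) -> usize {
        let can_take = attached.saturating_sub(expected_attached);
        let needed = fill_requirement.saturating_sub(already_planned);
        // Without the `needed` clamp, a single donor could overfill the target node.
        std::cmp::min(can_take, needed)
    }
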
Some((layer_filename, gen)) if gen.len() == 8 => { diff --git a/storage_scrubber/src/find_large_objects.rs b/storage_scrubber/src/find_large_objects.rs new file mode 100644 index 000000000000..2ef802229d1d --- /dev/null +++ b/storage_scrubber/src/find_large_objects.rs @@ -0,0 +1,120 @@ +use futures::{StreamExt, TryStreamExt}; +use pageserver::tenant::storage_layer::LayerName; +use serde::{Deserialize, Serialize}; + +use crate::{ + checks::parse_layer_object_name, init_remote, list_objects_with_retries, + metadata_stream::stream_tenants, BucketConfig, NodeKind, +}; + +#[derive(Serialize, Deserialize, Clone, Copy, PartialEq, Eq)] +enum LargeObjectKind { + DeltaLayer, + ImageLayer, + Other, +} + +impl LargeObjectKind { + fn from_key(key: &str) -> Self { + let fname = key.split('/').last().unwrap(); + + let Ok((layer_name, _generation)) = parse_layer_object_name(fname) else { + return LargeObjectKind::Other; + }; + + match layer_name { + LayerName::Image(_) => LargeObjectKind::ImageLayer, + LayerName::Delta(_) => LargeObjectKind::DeltaLayer, + } + } +} + +#[derive(Serialize, Deserialize, Clone)] +pub struct LargeObject { + pub key: String, + pub size: u64, + kind: LargeObjectKind, +} + +#[derive(Serialize, Deserialize)] +pub struct LargeObjectListing { + pub objects: Vec, +} + +pub async fn find_large_objects( + bucket_config: BucketConfig, + min_size: u64, + ignore_deltas: bool, + concurrency: usize, +) -> anyhow::Result { + let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; + let tenants = std::pin::pin!(stream_tenants(&s3_client, &target)); + + let objects_stream = tenants.map_ok(|tenant_shard_id| { + let mut tenant_root = target.tenant_root(&tenant_shard_id); + let s3_client = s3_client.clone(); + async move { + let mut objects = Vec::new(); + let mut total_objects_ctr = 0u64; + // We want the objects and not just common prefixes + tenant_root.delimiter.clear(); + let mut continuation_token = None; + loop { + let fetch_response = + list_objects_with_retries(&s3_client, &tenant_root, continuation_token.clone()) + .await?; + for obj in fetch_response.contents().iter().filter(|o| { + if let Some(obj_size) = o.size { + min_size as i64 <= obj_size + } else { + false + } + }) { + let key = obj.key().expect("couldn't get key").to_owned(); + let kind = LargeObjectKind::from_key(&key); + if ignore_deltas && kind == LargeObjectKind::DeltaLayer { + continue; + } + objects.push(LargeObject { + key, + size: obj.size.unwrap() as u64, + kind, + }) + } + total_objects_ctr += fetch_response.contents().len() as u64; + match fetch_response.next_continuation_token { + Some(new_token) => continuation_token = Some(new_token), + None => break, + } + } + + Ok((tenant_shard_id, objects, total_objects_ctr)) + } + }); + let mut objects_stream = std::pin::pin!(objects_stream.try_buffer_unordered(concurrency)); + + let mut objects = Vec::new(); + + let mut tenant_ctr = 0u64; + let mut object_ctr = 0u64; + while let Some(res) = objects_stream.next().await { + let (tenant_shard_id, objects_slice, total_objects_ctr) = res?; + objects.extend_from_slice(&objects_slice); + + object_ctr += total_objects_ctr; + tenant_ctr += 1; + if tenant_ctr % 100 == 0 { + tracing::info!( + "Scanned {tenant_ctr} shards. objects={object_ctr}, found={}, current={tenant_shard_id}.", + objects.len() + ); + } + } + + let bucket_name = target.bucket_name(); + tracing::info!( + "Scan of {bucket_name} finished. Scanned {tenant_ctr} shards. 
objects={object_ctr}, found={}.", + objects.len() + ); + Ok(LargeObjectListing { objects }) +} diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index ce0ff10ec6e1..04508519881e 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -140,7 +140,7 @@ async fn find_garbage_inner( node_kind: NodeKind, ) -> anyhow::Result { // Construct clients for S3 and for Console API - let (s3_client, target) = init_remote(bucket_config.clone(), node_kind)?; + let (s3_client, target) = init_remote(bucket_config.clone(), node_kind).await?; let cloud_admin_api_client = Arc::new(CloudAdminApiClient::new(console_config)); // Build a set of console-known tenants, for quickly eliminating known-active tenants without having @@ -432,7 +432,7 @@ pub async fn purge_garbage( ); let (s3_client, target) = - init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind)?; + init_remote(garbage_list.bucket_config.clone(), garbage_list.node_kind).await?; // Sanity checks on the incoming list if garbage_list.active_tenant_count == 0 { diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 64273432fc0c..9102ad9906f2 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -2,6 +2,7 @@ #![deny(clippy::undocumented_unsafe_blocks)] pub mod checks; pub mod cloud_admin_api; +pub mod find_large_objects; pub mod garbage; pub mod metadata_stream; pub mod pageserver_physical_gc; @@ -14,17 +15,10 @@ use std::fmt::Display; use std::sync::Arc; use std::time::Duration; -use anyhow::Context; -use aws_config::environment::EnvironmentVariableCredentialsProvider; -use aws_config::imds::credentials::ImdsCredentialsProvider; -use aws_config::meta::credentials::CredentialsProviderChain; -use aws_config::profile::ProfileFileCredentialsProvider; -use aws_config::retry::RetryConfig; -use aws_config::sso::SsoCredentialsProvider; -use aws_config::BehaviorVersion; -use aws_sdk_s3::config::{AsyncSleep, Region, SharedAsyncSleep}; -use aws_sdk_s3::{Client, Config}; -use aws_smithy_async::rt::sleep::TokioSleep; +use anyhow::{anyhow, Context}; +use aws_sdk_s3::config::Region; +use aws_sdk_s3::error::DisplayErrorContext; +use aws_sdk_s3::Client; use camino::{Utf8Path, Utf8PathBuf}; use clap::ValueEnum; @@ -241,85 +235,53 @@ impl ConsoleConfig { } } -pub fn init_logging(file_name: &str) -> WorkerGuard { - let (file_writer, guard) = - tracing_appender::non_blocking(tracing_appender::rolling::never("./logs/", file_name)); - - let file_logs = fmt::Layer::new() - .with_target(false) - .with_ansi(false) - .with_writer(file_writer); +pub fn init_logging(file_name: &str) -> Option { let stderr_logs = fmt::Layer::new() .with_target(false) .with_writer(std::io::stderr); - tracing_subscriber::registry() - .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))) - .with(file_logs) - .with(stderr_logs) - .init(); - - guard -} -pub fn init_s3_client(bucket_region: Region) -> Client { - let credentials_provider = { - // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" - let chain = CredentialsProviderChain::first_try( - "env", - EnvironmentVariableCredentialsProvider::new(), - ) - // uses "AWS_PROFILE" / `aws sso login --profile ` - .or_else( - "profile-sso", - ProfileFileCredentialsProvider::builder().build(), - ); - - // Use SSO if we were given an account ID - match std::env::var("SSO_ACCOUNT_ID").ok() { - Some(sso_account) => chain.or_else( - "sso", - SsoCredentialsProvider::builder() - .account_id(sso_account) - 
.role_name("PowerUserAccess") - .start_url("https://neondb.awsapps.com/start") - .region(bucket_region.clone()) - .build(), - ), - None => chain, - } - .or_else( - // Finally try IMDS - "imds", - ImdsCredentialsProvider::builder().build(), - ) + let disable_file_logging = match std::env::var("PAGESERVER_DISABLE_FILE_LOGGING") { + Ok(s) => s == "1" || s.to_lowercase() == "true", + Err(_) => false, }; - let sleep_impl: Arc = Arc::new(TokioSleep::new()); - - let mut builder = Config::builder() - .behavior_version( - #[allow(deprecated)] /* TODO: https://github.com/neondatabase/neon/issues/7665 */ - BehaviorVersion::v2023_11_09(), - ) - .region(bucket_region) - .retry_config(RetryConfig::adaptive().with_max_attempts(3)) - .sleep_impl(SharedAsyncSleep::from(sleep_impl)) - .credentials_provider(credentials_provider); - - if let Ok(endpoint) = env::var("AWS_ENDPOINT_URL") { - builder = builder.endpoint_url(endpoint) + if disable_file_logging { + tracing_subscriber::registry() + .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))) + .with(stderr_logs) + .init(); + None + } else { + let (file_writer, guard) = + tracing_appender::non_blocking(tracing_appender::rolling::never("./logs/", file_name)); + let file_logs = fmt::Layer::new() + .with_target(false) + .with_ansi(false) + .with_writer(file_writer); + tracing_subscriber::registry() + .with(EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info"))) + .with(stderr_logs) + .with(file_logs) + .init(); + Some(guard) } +} - Client::from_conf(builder.build()) +pub async fn init_s3_client(bucket_region: Region) -> Client { + let config = aws_config::defaults(aws_config::BehaviorVersion::v2024_03_28()) + .region(bucket_region) + .load() + .await; + Client::new(&config) } -fn init_remote( +async fn init_remote( bucket_config: BucketConfig, node_kind: NodeKind, ) -> anyhow::Result<(Arc, RootTarget)> { let bucket_region = Region::new(bucket_config.region); let delimiter = "/".to_string(); - let s3_client = Arc::new(init_s3_client(bucket_region)); + let s3_client = Arc::new(init_s3_client(bucket_region).await); let s3_root = match node_kind { NodeKind::Pageserver => RootTarget::Pageserver(S3Target { @@ -344,7 +306,7 @@ async fn list_objects_with_retries( s3_target: &S3Target, continuation_token: Option, ) -> anyhow::Result { - for _ in 0..MAX_RETRIES { + for trial in 0..MAX_RETRIES { match s3_client .list_objects_v2() .bucket(&s3_target.bucket_name) @@ -356,16 +318,22 @@ async fn list_objects_with_retries( { Ok(response) => return Ok(response), Err(e) => { + if trial == MAX_RETRIES - 1 { + return Err(e) + .with_context(|| format!("Failed to list objects {MAX_RETRIES} times")); + } error!( - "list_objects_v2 query failed: {e}, bucket_name={}, prefix={}, delimiter={}", - s3_target.bucket_name, s3_target.prefix_in_bucket, s3_target.delimiter + "list_objects_v2 query failed: bucket_name={}, prefix={}, delimiter={}, error={}", + s3_target.bucket_name, + s3_target.prefix_in_bucket, + s3_target.delimiter, + DisplayErrorContext(e), ); tokio::time::sleep(Duration::from_secs(1)).await; } } } - - anyhow::bail!("Failed to list objects {MAX_RETRIES} times") + Err(anyhow!("unreachable unless MAX_RETRIES==0")) } async fn download_object_with_retries( diff --git a/storage_scrubber/src/main.rs b/storage_scrubber/src/main.rs index 222bd10ed248..d81612119263 100644 --- a/storage_scrubber/src/main.rs +++ b/storage_scrubber/src/main.rs @@ -1,6 +1,7 @@ use anyhow::bail; use camino::Utf8PathBuf; use 
pageserver_api::shard::TenantShardId; +use storage_scrubber::find_large_objects; use storage_scrubber::garbage::{find_garbage, purge_garbage, PurgeMode}; use storage_scrubber::pageserver_physical_gc::GcMode; use storage_scrubber::scan_pageserver_metadata::scan_metadata; @@ -72,6 +73,14 @@ enum Command { #[arg(short, long, default_value_t = GcMode::IndicesOnly)] mode: GcMode, }, + FindLargeObjects { + #[arg(long = "min-size")] + min_size: u64, + #[arg(short, long, default_value_t = false)] + ignore_deltas: bool, + #[arg(long = "concurrency", short = 'j', default_value_t = 64)] + concurrency: usize, + }, } #[tokio::main] @@ -86,6 +95,7 @@ async fn main() -> anyhow::Result<()> { Command::PurgeGarbage { .. } => "purge-garbage", Command::TenantSnapshot { .. } => "tenant-snapshot", Command::PageserverPhysicalGc { .. } => "pageserver-physical-gc", + Command::FindLargeObjects { .. } => "find-large-objects", }; let _guard = init_logging(&format!( "{}_{}_{}_{}.log", @@ -186,7 +196,7 @@ async fn main() -> anyhow::Result<()> { concurrency, } => { let downloader = - SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency)?; + SnapshotDownloader::new(bucket_config, tenant_id, output_path, concurrency).await?; downloader.download().await } Command::PageserverPhysicalGc { @@ -199,5 +209,20 @@ async fn main() -> anyhow::Result<()> { println!("{}", serde_json::to_string(&summary).unwrap()); Ok(()) } + Command::FindLargeObjects { + min_size, + ignore_deltas, + concurrency, + } => { + let summary = find_large_objects::find_large_objects( + bucket_config, + min_size, + ignore_deltas, + concurrency, + ) + .await?; + println!("{}", serde_json::to_string(&summary).unwrap()); + Ok(()) + } } } diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 014643312807..fb8fbc1635ae 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -160,7 +160,7 @@ pub async fn pageserver_physical_gc( min_age: Duration, mode: GcMode, ) -> anyhow::Result { - let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + let (s3_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; let tenants = if tenant_ids.is_empty() { futures::future::Either::Left(stream_tenants(&s3_client, &target)) diff --git a/storage_scrubber/src/scan_pageserver_metadata.rs b/storage_scrubber/src/scan_pageserver_metadata.rs index af74ffa4cdbd..df4f29acf72b 100644 --- a/storage_scrubber/src/scan_pageserver_metadata.rs +++ b/storage_scrubber/src/scan_pageserver_metadata.rs @@ -199,7 +199,7 @@ pub async fn scan_metadata( bucket_config: BucketConfig, tenant_ids: Vec, ) -> anyhow::Result { - let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver)?; + let (s3_client, target) = init_remote(bucket_config, NodeKind::Pageserver).await?; let tenants = if tenant_ids.is_empty() { futures::future::Either::Left(stream_tenants(&s3_client, &target)) diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 24051b03de08..553adf8f468e 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -106,7 +106,7 @@ pub async fn scan_safekeeper_metadata( let timelines = client.query(&query, &[]).await?; info!("loaded {} timelines", timelines.len()); - let (s3_client, target) = init_remote(bucket_config, NodeKind::Safekeeper)?; + let (s3_client, target) = 
init_remote(bucket_config, NodeKind::Safekeeper).await?; let console_config = ConsoleConfig::from_env()?; let cloud_admin_api_client = CloudAdminApiClient::new(console_config); diff --git a/storage_scrubber/src/tenant_snapshot.rs b/storage_scrubber/src/tenant_snapshot.rs index 450b337235f0..5a75f8d40ecf 100644 --- a/storage_scrubber/src/tenant_snapshot.rs +++ b/storage_scrubber/src/tenant_snapshot.rs @@ -28,13 +28,13 @@ pub struct SnapshotDownloader { } impl SnapshotDownloader { - pub fn new( + pub async fn new( bucket_config: BucketConfig, tenant_id: TenantId, output_path: Utf8PathBuf, concurrency: usize, ) -> anyhow::Result { - let (s3_client, s3_root) = init_remote(bucket_config.clone(), NodeKind::Pageserver)?; + let (s3_client, s3_root) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; Ok(Self { s3_client, s3_root, @@ -215,7 +215,8 @@ impl SnapshotDownloader { } pub async fn download(&self) -> anyhow::Result<()> { - let (s3_client, target) = init_remote(self.bucket_config.clone(), NodeKind::Pageserver)?; + let (s3_client, target) = + init_remote(self.bucket_config.clone(), NodeKind::Pageserver).await?; // Generate a stream of TenantShardId let shards = stream_tenant_shards(&s3_client, &target, self.tenant_id).await?; diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 41fa8e679f28..c019cbbc7790 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -144,6 +144,8 @@ def histogram(prefix_without_trailing_underscore: str) -> List[str]: "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", "pageserver_smgr_query_seconds_sum", + "pageserver_archive_size", + "pageserver_pitr_history_size", "pageserver_storage_operations_seconds_count_total", "pageserver_storage_operations_seconds_sum_total", "pageserver_evictions_total", diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py new file mode 100644 index 000000000000..39baf5fab69f --- /dev/null +++ b/test_runner/fixtures/neon_api.py @@ -0,0 +1,263 @@ +from __future__ import annotations + +import time +from typing import TYPE_CHECKING, cast + +import requests + +if TYPE_CHECKING: + from typing import Any, Dict, Literal, Optional, Union + + from fixtures.pg_version import PgVersion + + +def connection_parameters_to_env(params: Dict[str, str]) -> Dict[str, str]: + return { + "PGHOST": params["host"], + "PGDATABASE": params["database"], + "PGUSER": params["role"], + "PGPASSWORD": params["password"], + } + + +class NeonAPI: + def __init__(self, neon_api_key: str, neon_api_base_url: str): + self.__neon_api_key = neon_api_key + self.__neon_api_base_url = neon_api_base_url.strip("/") + + def __request( + self, method: Union[str, bytes], endpoint: str, **kwargs: Any + ) -> requests.Response: + if "headers" not in kwargs: + kwargs["headers"] = {} + kwargs["headers"]["Authorization"] = f"Bearer {self.__neon_api_key}" + + return requests.request(method, f"{self.__neon_api_base_url}{endpoint}", **kwargs) + + def create_project( + self, + pg_version: Optional[PgVersion] = None, + name: Optional[str] = None, + branch_name: Optional[str] = None, + branch_role_name: Optional[str] = None, + branch_database_name: Optional[str] = None, + ) -> Dict[str, Any]: + data: Dict[str, Any] = { + "project": { + "branch": {}, + }, + } + if name: + data["project"]["name"] = name + if pg_version: + data["project"]["pg_version"] = int(pg_version) + if branch_name: + data["project"]["branch"]["name"] = branch_name + if branch_role_name: + 
data["project"]["branch"]["role_name"] = branch_role_name + if branch_database_name: + data["project"]["branch"]["database_name"] = branch_database_name + + resp = self.__request( + "POST", + "/projects", + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + json=data, + ) + + assert resp.status_code == 201 + + return cast("Dict[str, Any]", resp.json()) + + def get_project_details(self, project_id: str) -> Dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}", + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + ) + assert resp.status_code == 200 + return cast("Dict[str, Any]", resp.json()) + + def delete_project( + self, + project_id: str, + ) -> Dict[str, Any]: + resp = self.__request( + "DELETE", + f"/projects/{project_id}", + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def start_endpoint( + self, + project_id: str, + endpoint_id: str, + ) -> Dict[str, Any]: + resp = self.__request( + "POST", + f"/projects/{project_id}/endpoints/{endpoint_id}/start", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def suspend_endpoint( + self, + project_id: str, + endpoint_id: str, + ) -> Dict[str, Any]: + resp = self.__request( + "POST", + f"/projects/{project_id}/endpoints/{endpoint_id}/suspend", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def restart_endpoint( + self, + project_id: str, + endpoint_id: str, + ) -> Dict[str, Any]: + resp = self.__request( + "POST", + f"/projects/{project_id}/endpoints/{endpoint_id}/restart", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def create_endpoint( + self, + project_id: str, + branch_id: str, + endpoint_type: Literal["read_write", "read_only"], + settings: Dict[str, Any], + ) -> Dict[str, Any]: + data: Dict[str, Any] = { + "endpoint": { + "branch_id": branch_id, + }, + } + + if endpoint_type: + data["endpoint"]["type"] = endpoint_type + if settings: + data["endpoint"]["settings"] = settings + + resp = self.__request( + "POST", + f"/projects/{project_id}/endpoints", + headers={ + "Accept": "application/json", + "Content-Type": "application/json", + }, + json=data, + ) + + assert resp.status_code == 201 + + return cast("Dict[str, Any]", resp.json()) + + def get_connection_uri( + self, + project_id: str, + branch_id: Optional[str] = None, + endpoint_id: Optional[str] = None, + database_name: str = "neondb", + role_name: str = "neondb_owner", + pooled: bool = True, + ) -> Dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}/connection_uri", + params={ + "branch_id": branch_id, + "endpoint_id": endpoint_id, + "database_name": database_name, + "role_name": role_name, + "pooled": pooled, + }, + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def get_branches(self, project_id: str) -> Dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}/branches", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def get_endpoints(self, project_id: str) -> Dict[str, 
Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}/endpoints", + headers={ + "Accept": "application/json", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def get_operations(self, project_id: str) -> Dict[str, Any]: + resp = self.__request( + "GET", + f"/projects/{project_id}/operations", + headers={ + "Accept": "application/json", + "Authorization": f"Bearer {self.__neon_api_key}", + }, + ) + + assert resp.status_code == 200 + + return cast("Dict[str, Any]", resp.json()) + + def wait_for_operation_to_finish(self, project_id: str): + has_running = True + while has_running: + has_running = False + operations = self.get_operations(project_id)["operations"] + for op in operations: + if op["status"] in {"scheduling", "running", "cancelling"}: + has_running = True + time.sleep(0.5) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 6bfe1afd1f2a..5ca31644a910 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -87,6 +87,8 @@ ) from fixtures.utils import AuxFileStore as AuxFileStore # reexport +from .neon_api import NeonAPI + """ This file contains pytest fixtures. A fixture is a test resource that can be summoned by placing its name in the test's arguments. @@ -184,6 +186,25 @@ def versioned_pg_distrib_dir(pg_distrib_dir: Path, pg_version: PgVersion) -> Ite yield versioned_dir +@pytest.fixture(scope="session") +def neon_api_key() -> str: + api_key = os.getenv("NEON_API_KEY") + if not api_key: + raise AssertionError("Set the NEON_API_KEY environment variable") + + return api_key + + +@pytest.fixture(scope="session") +def neon_api_base_url() -> str: + return os.getenv("NEON_API_BASE_URL", "https://console-stage.neon.build/api/v2") + + +@pytest.fixture(scope="session") +def neon_api(neon_api_key: str, neon_api_base_url: str) -> NeonAPI: + return NeonAPI(neon_api_key, neon_api_base_url) + + def shareable_scope(fixture_name: str, config: Config) -> Literal["session", "function"]: """Return either session of function scope, depending on TEST_SHARED_FIXTURES envvar. 
@@ -471,6 +492,7 @@ def __init__( pageserver_virtual_file_io_engine: Optional[str] = None, pageserver_aux_file_policy: Optional[AuxFileStore] = None, pageserver_default_tenant_config_compaction_algorithm: Optional[Dict[str, Any]] = None, + safekeeper_extra_opts: Optional[list[str]] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -536,6 +558,8 @@ def __init__( self.pageserver_aux_file_policy = pageserver_aux_file_policy + self.safekeeper_extra_opts = safekeeper_extra_opts + assert test_name.startswith( "test_" ), "Unexpectedly instantiated from outside a test function" @@ -943,6 +967,8 @@ def __exit__( # if the test threw an exception, don't check for errors # as a failing assertion would cause the cleanup below to fail ps_assert_metric_no_errors=(exc_type is None), + # do not fail on endpoint errors to allow the rest of cleanup to proceed + fail_on_endpoint_errors=False, ) cleanup_error = None @@ -1167,8 +1193,12 @@ def __init__(self, config: NeonEnvBuilder): if config.auth_enabled: sk_cfg["auth_enabled"] = True if self.safekeepers_remote_storage is not None: - sk_cfg["remote_storage"] = self.safekeepers_remote_storage.to_toml_inline_table() - self.safekeepers.append(Safekeeper(env=self, id=id, port=port)) + sk_cfg[ + "remote_storage" + ] = self.safekeepers_remote_storage.to_toml_inline_table().strip() + self.safekeepers.append( + Safekeeper(env=self, id=id, port=port, extra_opts=config.safekeeper_extra_opts) + ) cfg["safekeepers"].append(sk_cfg) log.info(f"Config: {cfg}") @@ -1212,11 +1242,11 @@ def start(self, timeout_in_seconds: Optional[int] = None): for f in futs: f.result() - def stop(self, immediate=False, ps_assert_metric_no_errors=False): + def stop(self, immediate=False, ps_assert_metric_no_errors=False, fail_on_endpoint_errors=True): """ After this method returns, there should be no child processes running. 
""" - self.endpoints.stop_all() + self.endpoints.stop_all(fail_on_endpoint_errors) # Stop storage controller before pageservers: we don't want it to spuriously # detect a pageserver "failure" during test teardown @@ -1933,6 +1963,7 @@ def endpoint_reconfigure( endpoint_id: str, tenant_id: Optional[TenantId] = None, pageserver_id: Optional[int] = None, + safekeepers: Optional[List[int]] = None, check_return_code=True, ) -> "subprocess.CompletedProcess[str]": args = ["endpoint", "reconfigure", endpoint_id] @@ -1940,6 +1971,8 @@ def endpoint_reconfigure( args.extend(["--tenant-id", str(tenant_id)]) if pageserver_id is not None: args.extend(["--pageserver-id", str(pageserver_id)]) + if safekeepers is not None: + args.extend(["--safekeepers", (",".join(map(str, safekeepers)))]) return self.raw_cli(args, check_return_code=check_return_code) def endpoint_stop( @@ -2108,6 +2141,21 @@ def stop(self, immediate: bool = False) -> "NeonStorageController": self.running = False return self + @staticmethod + def retryable_node_operation(op, ps_id, max_attempts, backoff): + while max_attempts > 0: + try: + op(ps_id) + return + except StorageControllerApiException as e: + max_attempts -= 1 + log.info(f"Operation failed ({max_attempts} attempts left): {e}") + + if max_attempts == 0: + raise e + + time.sleep(backoff) + @staticmethod def raise_api_exception(res: requests.Response): try: @@ -2448,6 +2496,38 @@ def consistency_check(self): ) log.info("storage controller passed consistency check") + def poll_node_status( + self, node_id: int, desired_scheduling_policy: str, max_attempts: int, backoff: int + ): + """ + Poll the node status until it reaches 'desired_scheduling_policy' or 'max_attempts' have been exhausted + """ + log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy") + while max_attempts > 0: + try: + status = self.node_status(node_id) + policy = status["scheduling"] + if policy == desired_scheduling_policy: + return + else: + max_attempts -= 1 + log.info(f"Status call returned {policy=} ({max_attempts} attempts left)") + + if max_attempts == 0: + raise AssertionError( + f"Status for {node_id=} did not reach {desired_scheduling_policy=}" + ) + + time.sleep(backoff) + except StorageControllerApiException as e: + max_attempts -= 1 + log.info(f"Status call failed ({max_attempts} retries left): {e}") + + if max_attempts == 0: + raise e + + time.sleep(backoff) + def configure_failpoints(self, config_strings: Tuple[str, str] | List[Tuple[str, str]]): if isinstance(config_strings, tuple): pairs = [config_strings] @@ -2738,7 +2818,19 @@ def tenant_create( if generation is None: generation = self.env.storage_controller.attach_hook_issue(tenant_id, self.id) client = self.http_client(auth_token=auth_token) - return client.tenant_create(tenant_id, conf, generation=generation) + + conf = conf or {} + + client.tenant_location_conf( + tenant_id, + { + "mode": "AttachedSingle", + "generation": generation, + "tenant_conf": conf, + "secondary_conf": None, + }, + ) + return tenant_id def list_layers( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId @@ -2796,14 +2888,21 @@ def _build_env(self, env_add: Optional[Env]) -> Env: env.update(env_add) return env - def run( + def _log_env(self, env: dict[str, str]) -> None: + env_s = {} + for k, v in env.items(): + if k.startswith("PG") and k != "PGPASSWORD": + env_s[k] = v + log.debug(f"Environment: {env_s}") + + def run_nonblocking( self, command: List[str], env: Optional[Env] = None, cwd: Optional[Union[str, Path]] = None, 
- ): + ) -> subprocess.Popen[Any]: """ - Run one of the postgres binaries. + Run one of the postgres binaries, not waiting for it to finish The command should be in list form, e.g. ['pgbench', '-p', '55432'] @@ -2814,11 +2913,34 @@ def run( If you want stdout/stderr captured to files, use `run_capture` instead. """ - self._fixpath(command) log.info(f"Running command '{' '.join(command)}'") env = self._build_env(env) - subprocess.run(command, env=env, cwd=cwd, check=True) + self._log_env(env) + return subprocess.Popen(command, env=env, cwd=cwd, stdout=subprocess.PIPE, text=True) + + def run( + self, + command: List[str], + env: Optional[Env] = None, + cwd: Optional[Union[str, Path]] = None, + ) -> None: + """ + Run one of the postgres binaries, waiting for it to finish + + The command should be in list form, e.g. ['pgbench', '-p', '55432'] + + All the necessary environment variables will be set. + + If the first argument (the command name) doesn't include a path (no '/' + characters present), then it will be edited to include the correct path. + + If you want stdout/stderr captured to files, use `run_capture` instead. + """ + proc = self.run_nonblocking(command, env, cwd) + proc.wait() + if proc.returncode != 0: + raise subprocess.CalledProcessError(proc.returncode, proc.args) def run_capture( self, @@ -2838,6 +2960,7 @@ def run_capture( self._fixpath(command) log.info(f"Running command '{' '.join(command)}'") env = self._build_env(env) + self._log_env(env) base_path, _, _ = subprocess_capture( self.log_dir, command, @@ -3476,7 +3599,6 @@ def __init__( ): super().__init__(host="localhost", port=pg_port, user="cloud_admin", dbname="postgres") self.env = env - self.running = False self.branch_name: Optional[str] = None # dubious self.endpoint_id: Optional[str] = None # dubious, see asserts below self.pgdata_dir: Optional[str] = None # Path to computenode PGDATA @@ -3484,6 +3606,7 @@ def __init__( self.pg_port = pg_port self.http_port = http_port self.check_stop_result = check_stop_result + # passed to endpoint create and endpoint reconfigure self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers)) # path to conf is /endpoints//pgdata/postgresql.conf @@ -3552,6 +3675,7 @@ def start( self, remote_ext_config: Optional[str] = None, pageserver_id: Optional[int] = None, + safekeepers: Optional[List[int]] = None, allow_multiple: bool = False, ) -> "Endpoint": """ @@ -3561,6 +3685,11 @@ def start( assert self.endpoint_id is not None + # If `safekeepers` is not None, they are remember them as active and use + # in the following commands. + if safekeepers is not None: + self.active_safekeepers = safekeepers + log.info(f"Starting postgres endpoint {self.endpoint_id}") self.env.neon_cli.endpoint_start( @@ -3624,9 +3753,17 @@ def edit_hba(self, hba: List[str]): def is_running(self): return self._running._value > 0 - def reconfigure(self, pageserver_id: Optional[int] = None): + def reconfigure( + self, pageserver_id: Optional[int] = None, safekeepers: Optional[List[int]] = None + ): assert self.endpoint_id is not None - self.env.neon_cli.endpoint_reconfigure(self.endpoint_id, self.tenant_id, pageserver_id) + # If `safekeepers` is not None, they are remember them as active and use + # in the following commands. 
+ if safekeepers is not None: + self.active_safekeepers = safekeepers + self.env.neon_cli.endpoint_reconfigure( + self.endpoint_id, self.tenant_id, pageserver_id, self.active_safekeepers + ) def respec(self, **kwargs): """Update the endpoint.json file used by control_plane.""" @@ -3821,13 +3958,23 @@ def create( pageserver_id=pageserver_id, ) - def stop_all(self) -> "EndpointFactory": + def stop_all(self, fail_on_error=True) -> "EndpointFactory": + exception = None for ep in self.endpoints: - ep.stop() + try: + ep.stop() + except Exception as e: + log.error(f"Failed to stop endpoint {ep.endpoint_id}: {e}") + exception = e + + if fail_on_error and exception is not None: + raise exception return self - def new_replica(self, origin: Endpoint, endpoint_id: str, config_lines: Optional[List[str]]): + def new_replica( + self, origin: Endpoint, endpoint_id: str, config_lines: Optional[List[str]] = None + ): branch_name = origin.branch_name assert origin in self.endpoints assert branch_name is not None @@ -3874,16 +4021,28 @@ class Safekeeper(LogUtils): id: int running: bool = False - def __init__(self, env: NeonEnv, port: SafekeeperPort, id: int, running: bool = False): + def __init__( + self, + env: NeonEnv, + port: SafekeeperPort, + id: int, + running: bool = False, + extra_opts: Optional[List[str]] = None, + ): self.env = env self.port = port self.id = id self.running = running self.logfile = Path(self.data_dir) / f"safekeeper-{id}.log" + self.extra_opts = extra_opts def start( self, extra_opts: Optional[List[str]] = None, timeout_in_seconds: Optional[int] = None ) -> "Safekeeper": + if extra_opts is None: + # Apply either the extra_opts passed in, or the ones from our constructor: we do not merge the two. + extra_opts = self.extra_opts + assert self.running is False self.env.neon_cli.safekeeper_start( self.id, extra_opts=extra_opts, timeout_in_seconds=timeout_in_seconds diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 794961271418..03aee9e5c597 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -220,34 +220,6 @@ def tenant_list(self) -> List[Dict[Any, Any]]: assert isinstance(res_json, list) return res_json - def tenant_create( - self, - new_tenant_id: Union[TenantId, TenantShardId], - conf: Optional[Dict[str, Any]] = None, - generation: Optional[int] = None, - ) -> TenantId: - if conf is not None: - assert "new_tenant_id" not in conf.keys() - - body: Dict[str, Any] = { - "new_tenant_id": str(new_tenant_id), - **(conf or {}), - } - - if generation is not None: - body.update({"generation": generation}) - - res = self.post( - f"http://localhost:{self.port}/v1/tenant", - json=body, - ) - self.verbose_error(res) - if res.status_code == 409: - raise Exception(f"could not create tenant: already exists for id {new_tenant_id}") - new_tenant_id = res.json() - assert isinstance(new_tenant_id, str) - return TenantId(new_tenant_id) - def tenant_attach( self, tenant_id: Union[TenantId, TenantShardId], @@ -627,6 +599,22 @@ def timeline_get_lsn_by_timestamp( res_json = res.json() return res_json + def timeline_lsn_lease( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn + ): + data = { + "lsn": str(lsn), + } + + log.info(f"Requesting lsn lease for {lsn=}, {tenant_id=}, {timeline_id=}") + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/lsn_lease", + json=data, + ) + self.verbose_error(res) + res_json = res.json() + return 
res_json + def timeline_get_timestamp_of_lsn( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, lsn: Lsn ): diff --git a/test_runner/fixtures/pageserver/many_tenants.py b/test_runner/fixtures/pageserver/many_tenants.py index 8730d8ef751d..c437258c6f87 100644 --- a/test_runner/fixtures/pageserver/many_tenants.py +++ b/test_runner/fixtures/pageserver/many_tenants.py @@ -42,10 +42,6 @@ def single_timeline( log.info("detach template tenant form pageserver") env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) log.info(f"duplicating template tenant {ncopies} times in S3") tenants = fixtures.pageserver.remote_storage.duplicate_tenant(env, template_tenant, ncopies) diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 60535b759261..b75a480a637e 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -198,7 +198,7 @@ def wait_for_last_record_lsn( lsn: Lsn, ) -> Lsn: """waits for pageserver to catch up to a certain lsn, returns the last observed lsn.""" - for i in range(100): + for i in range(1000): current_lsn = last_record_lsn(pageserver_http, tenant, timeline) if current_lsn >= lsn: return current_lsn diff --git a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py index 0ff9c8fdaa98..33848b06d35c 100644 --- a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py +++ b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py @@ -55,10 +55,6 @@ def setup_template(env: NeonEnv): } template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) env.pageserver.tenant_attach(template_tenant, config) ep = env.endpoints.create_start("main", tenant_id=template_tenant) ep.safe_psql("create table foo(b text)") diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index b66db4d0ab72..b41ae601975f 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -86,10 +86,6 @@ def setup_tenant_template(env: NeonEnv, n_txns: int): template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) env.pageserver.tenant_attach(template_tenant, config) ps_http = env.pageserver.http_client() diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 1d579214b0c5..60861cf939b8 100644 --- 
a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -1,4 +1,5 @@ import json +import os from pathlib import Path from typing import Any, Dict, Tuple @@ -17,30 +18,74 @@ setup_pageserver_with_tenants, ) +# The following tests use pagebench "getpage at latest LSN" to characterize the throughput of the pageserver. +# originally there was a single test named `test_pageserver_max_throughput_getpage_at_latest_lsn`` +# so you still see some references to this name in the code. +# To avoid recreating the snapshots for each test, we continue to use the name `max_throughput_latest_lsn` +# for some files and metrics. + + +# For reference, the space usage of the snapshots: +# sudo du -hs /instance_store/neon/test_output/shared-snapshots/* +# 416G /instance_store/neon/test_output/shared-snapshots/max_throughput_latest_lsn-500-13 +@pytest.mark.parametrize("duration", [60 * 60]) +@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)]) +@pytest.mark.parametrize("n_tenants", [500]) +@pytest.mark.timeout(10000) +@pytest.mark.skipif( + os.getenv("CI", "false") == "true", + reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI", +) +def test_pageserver_characterize_throughput_with_n_tenants( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + n_tenants: int, + pgbench_scale: int, + duration: int, +): + setup_and_run_pagebench_benchmark( + neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, 1 + ) + # For reference, the space usage of the snapshots: -# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots -# 137G /instance_store/test_output/shared-snapshots -# admin@ip-172-31-13-23:[~/neon-main]: sudo du -hs /instance_store/test_output/shared-snapshots/* -# 1.8G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-13 -# 1.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-1-6 -# 8.5G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-13 -# 5.1G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-10-6 -# 76G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-13 -# 46G /instance_store/test_output/shared-snapshots/max_throughput_latest_lsn-100-6 -@pytest.mark.parametrize("duration", [30]) -@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(s) for s in [100, 200]]) -@pytest.mark.parametrize("n_tenants", [1, 10]) -@pytest.mark.timeout( - 10000 -) # TODO: this value is just "a really high number"; have this per instance type -def test_pageserver_max_throughput_getpage_at_latest_lsn( +# sudo du -hs /instance_store/neon/test_output/shared-snapshots/* +# 19G /instance_store/neon/test_output/shared-snapshots/max_throughput_latest_lsn-1-136 +@pytest.mark.parametrize("duration", [20 * 60]) +@pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(2048)]) +# we use 1 client to characterize latencies, and 64 clients to characterize throughput/scalability +# we use 64 clients because typically for a high number of connections we recommend the connection pooler +# which by default uses 64 connections +@pytest.mark.parametrize("n_clients", [1, 64]) +@pytest.mark.parametrize("n_tenants", [1]) +@pytest.mark.timeout(2400) +@pytest.mark.skipif( + os.getenv("CI", "false") == "true", + 
reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI", +) +def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant( neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker, pg_bin: PgBin, n_tenants: int, pgbench_scale: int, duration: int, + n_clients: int, +): + setup_and_run_pagebench_benchmark( + neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, n_clients + ) + + +def setup_and_run_pagebench_benchmark( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + pg_bin: PgBin, + n_tenants: int, + pgbench_scale: int, + duration: int, + n_clients: int, ): def record(metric, **kwargs): zenbenchmark.record( @@ -55,6 +100,7 @@ def record(metric, **kwargs): "n_tenants": (n_tenants, {"unit": ""}), "pgbench_scale": (pgbench_scale, {"unit": ""}), "duration": (duration, {"unit": "s"}), + "n_clients": (n_clients, {"unit": ""}), } ) @@ -96,7 +142,7 @@ def setup_wrapper(env: NeonEnv): r".*query handler for.*pagestream.*failed: unexpected message: CopyFail during COPY.*" ) - run_benchmark_max_throughput_latest_lsn(env, pg_bin, record, duration) + run_pagebench_benchmark(env, pg_bin, record, duration, n_clients) def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): @@ -118,10 +164,6 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): } template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) - env.pageserver.allowed_errors.append( - # tenant detach causes this because the underlying attach-hook removes the tenant from storage controller entirely - ".*Dropped remote consistent LSN updates.*", - ) env.pageserver.tenant_attach(template_tenant, config) ps_http = env.pageserver.http_client() with env.endpoints.create_start("main", tenant_id=template_tenant) as ep: @@ -157,8 +199,8 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): return (template_tenant, template_timeline, config) -def run_benchmark_max_throughput_latest_lsn( - env: NeonEnv, pg_bin: PgBin, record, duration_secs: int +def run_pagebench_benchmark( + env: NeonEnv, pg_bin: PgBin, record, duration_secs: int, n_clients: int ): """ Benchmark `env.pageserver` for max throughput @ latest LSN and record results in `zenbenchmark`. 
@@ -172,6 +214,8 @@ def run_benchmark_max_throughput_latest_lsn( ps_http.base_url, "--page-service-connstring", env.pageserver.connstr(password=None), + "--num-clients", + str(n_clients), "--runtime", f"{duration_secs}s", # don't specify the targets explicitly, let pagebench auto-discover them diff --git a/test_runner/performance/pageserver/util.py b/test_runner/performance/pageserver/util.py index 92e05663ce20..88296a7fbdec 100644 --- a/test_runner/performance/pageserver/util.py +++ b/test_runner/performance/pageserver/util.py @@ -22,7 +22,7 @@ def ensure_pageserver_ready_for_benchmarking(env: NeonEnv, n_tenants: int): log.info("wait for all tenants to become active") wait_until_all_tenants_state( - ps_http, "Active", iterations=n_tenants, period=1, http_error_ok=False + ps_http, "Active", iterations=10 + n_tenants, period=1, http_error_ok=False ) # ensure all layers are resident for predictiable performance diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 7d11facc2949..5ab83dd31d0b 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -1,8 +1,24 @@ +from __future__ import annotations + import time +import traceback +from typing import TYPE_CHECKING +import psycopg2 +import psycopg2.extras import pytest +from fixtures.benchmark_fixture import MetricReport +from fixtures.common_types import Lsn from fixtures.log_helper import log -from fixtures.neon_fixtures import AuxFileStore, NeonEnv, PgBin, logical_replication_sync +from fixtures.neon_api import connection_parameters_to_env +from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync +from fixtures.pg_version import PgVersion + +if TYPE_CHECKING: + from fixtures.benchmark_fixture import NeonBenchmarker + from fixtures.neon_api import NeonAPI + from fixtures.neon_fixtures import NeonEnv, PgBin + from fixtures.pg_version import PgVersion @pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2]) @@ -26,7 +42,6 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg vanilla_pg.safe_psql("truncate table pgbench_history") connstr = endpoint.connstr().replace("'", "''") - print(f"connstr='{connstr}'") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") # Wait logical replication channel to be established @@ -42,3 +57,286 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg sum_master = endpoint.safe_psql("select sum(abalance) from pgbench_accounts")[0][0] sum_replica = vanilla_pg.safe_psql("select sum(abalance) from pgbench_accounts")[0][0] assert sum_master == sum_replica + + +def check_pgbench_still_running(pgbench, label=""): + rc = pgbench.poll() + if rc is not None: + raise RuntimeError(f"{label} pgbench terminated early with return code {rc}") + + +def measure_logical_replication_lag(sub_cur, pub_cur, timeout_sec=600): + start = time.time() + pub_cur.execute("SELECT pg_current_wal_flush_lsn()") + pub_lsn = Lsn(pub_cur.fetchall()[0][0]) + while (time.time() - start) < timeout_sec: + sub_cur.execute("SELECT latest_end_lsn FROM pg_catalog.pg_stat_subscription") + res = sub_cur.fetchall()[0][0] + if res: + log.info(f"subscriber_lsn={res}") + sub_lsn = Lsn(res) + log.info(f"Subscriber LSN={sub_lsn}, publisher LSN={pub_lsn}") + if sub_lsn >= pub_lsn: + return time.time() - start + time.sleep(0.5) + raise TimeoutError(f"Logical replication sync took more than {timeout_sec} 
sec") + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_subscriber_lag( + pg_bin: PgBin, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + """ + Creates a publisher and subscriber, runs pgbench inserts on publisher and pgbench selects + on subscriber. Periodically restarts subscriber while still running the inserts, and + measures how long sync takes after restart. + """ + test_duration_min = 60 + sync_interval_min = 5 + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + pub_project = neon_api.create_project(pg_version) + pub_project_id = pub_project["project"]["id"] + neon_api.wait_for_operation_to_finish(pub_project_id) + error_occurred = False + try: + sub_project = neon_api.create_project(pg_version) + sub_project_id = sub_project["project"]["id"] + sub_endpoint_id = sub_project["endpoints"][0]["id"] + neon_api.wait_for_operation_to_finish(sub_project_id) + try: + pub_env = connection_parameters_to_env( + pub_project["connection_uris"][0]["connection_parameters"] + ) + sub_env = connection_parameters_to_env( + sub_project["connection_uris"][0]["connection_parameters"] + ) + pub_connstr = pub_project["connection_uris"][0]["connection_uri"] + sub_connstr = sub_project["connection_uris"][0]["connection_uri"] + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + sub_cur.execute("truncate table pgbench_accounts") + sub_cur.execute("truncate table pgbench_history") + + pub_cur.execute( + "create publication pub1 for table pgbench_accounts, pgbench_history" + ) + sub_cur.execute( + f"create subscription sub1 connection '{pub_connstr}' publication pub1" + ) + + initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() + sub_conn.close() + + zenbenchmark.record( + "initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER + ) + + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env + ) + try: + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) + try: + start = time.time() + while time.time() - start < test_duration_min * 60: + time.sleep(sync_interval_min * 60) + check_pgbench_still_running(pub_workload, "pub") + check_pgbench_still_running(sub_workload, "sub") + + with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( + sub_connstr + ) as sub_conn: + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) + + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + sub_workload.terminate() + neon_api.restart_endpoint( + sub_project_id, + sub_endpoint_id, + ) + neon_api.wait_for_operation_to_finish(sub_project_id) + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) + + # Measure storage to make sure replication information isn't bloating storage + sub_storage = neon_api.get_project_details(sub_project_id)["project"][ + "synthetic_storage_size" + ] + pub_storage = neon_api.get_project_details(pub_project_id)["project"][ + "synthetic_storage_size" + ] + zenbenchmark.record( + 
"sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER + ) + zenbenchmark.record( + "pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER + ) + + finally: + sub_workload.terminate() + finally: + pub_workload.terminate() + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + if not error_occurred: + neon_api.delete_project(sub_project_id) + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred + neon_api.delete_project(pub_project_id) + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_publisher_restart( + pg_bin: PgBin, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + """ + Creates a publisher and subscriber, runs pgbench inserts on publisher and pgbench selects + on subscriber. Periodically restarts publisher (to exercise on-demand WAL download), and + measures how long sync takes after restart. + """ + test_duration_min = 60 + sync_interval_min = 5 + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + pub_project = neon_api.create_project(pg_version) + pub_project_id = pub_project["project"]["id"] + pub_endpoint_id = pub_project["endpoints"][0]["id"] + neon_api.wait_for_operation_to_finish(pub_project_id) + error_occurred = False + try: + sub_project = neon_api.create_project(pg_version) + sub_project_id = sub_project["project"]["id"] + neon_api.wait_for_operation_to_finish(sub_project_id) + try: + pub_env = connection_parameters_to_env( + pub_project["connection_uris"][0]["connection_parameters"] + ) + sub_env = connection_parameters_to_env( + sub_project["connection_uris"][0]["connection_parameters"] + ) + pub_connstr = pub_project["connection_uris"][0]["connection_uri"] + sub_connstr = sub_project["connection_uris"][0]["connection_uri"] + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=pub_env) + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=sub_env) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + sub_cur.execute("truncate table pgbench_accounts") + sub_cur.execute("truncate table pgbench_history") + + pub_cur.execute( + "create publication pub1 for table pgbench_accounts, pgbench_history" + ) + sub_cur.execute( + f"create subscription sub1 connection '{pub_connstr}' publication pub1" + ) + + initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() + sub_conn.close() + + zenbenchmark.record( + "initial_sync_lag", initial_sync_lag, "s", MetricReport.LOWER_IS_BETTER + ) + + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env + ) + try: + sub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=sub_env, + ) + try: + start = time.time() + while time.time() - start < test_duration_min * 60: + time.sleep(sync_interval_min * 60) + check_pgbench_still_running(pub_workload, "pub") + check_pgbench_still_running(sub_workload, "sub") + + pub_workload.terminate() + neon_api.restart_endpoint( + pub_project_id, + pub_endpoint_id, + ) + neon_api.wait_for_operation_to_finish(pub_project_id) + pub_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], + env=pub_env, + ) + with psycopg2.connect(pub_connstr) as 
pub_conn, psycopg2.connect( + sub_connstr + ) as sub_conn: + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) + + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + + # Measure storage to make sure replication information isn't bloating storage + sub_storage = neon_api.get_project_details(sub_project_id)["project"][ + "synthetic_storage_size" + ] + pub_storage = neon_api.get_project_details(pub_project_id)["project"][ + "synthetic_storage_size" + ] + zenbenchmark.record( + "sub_storage", sub_storage, "B", MetricReport.LOWER_IS_BETTER + ) + zenbenchmark.record( + "pub_storage", pub_storage, "B", MetricReport.LOWER_IS_BETTER + ) + + finally: + sub_workload.terminate() + finally: + pub_workload.terminate() + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + if not error_occurred: + neon_api.delete_project(sub_project_id) + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred + neon_api.delete_project(pub_project_id) diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py new file mode 100644 index 000000000000..7e1619721144 --- /dev/null +++ b/test_runner/performance/test_physical_replication.py @@ -0,0 +1,296 @@ +from __future__ import annotations + +import csv +import os +import subprocess +import time +import traceback +from pathlib import Path +from typing import TYPE_CHECKING + +import psycopg2 +import psycopg2.extras +import pytest +from fixtures.benchmark_fixture import MetricReport +from fixtures.common_types import Lsn +from fixtures.log_helper import log +from fixtures.neon_api import connection_parameters_to_env +from fixtures.pg_version import PgVersion + +if TYPE_CHECKING: + from typing import Any, List, Optional + + from fixtures.benchmark_fixture import NeonBenchmarker + from fixtures.neon_api import NeonAPI + from fixtures.neon_fixtures import PgBin + + +# Granularity of ~0.5 sec +def measure_replication_lag(master, replica, timeout_sec=600): + start = time.time() + master.execute("SELECT pg_current_wal_flush_lsn()") + master_lsn = Lsn(master.fetchall()[0][0]) + while (time.time() - start) < timeout_sec: + replica.execute("select pg_last_wal_replay_lsn()") + replica_lsn = replica.fetchall()[0][0] + if replica_lsn: + if Lsn(replica_lsn) >= master_lsn: + return time.time() - start + time.sleep(0.5) + raise TimeoutError(f"Replication sync took more than {timeout_sec} sec") + + +def check_pgbench_still_running(pgbench): + rc = pgbench.poll() + if rc is not None: + raise RuntimeError(f"Pgbench terminated early with return code {rc}") + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_ro_replica_lag( + pg_bin: PgBin, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + test_duration_min = 60 + sync_interval_min = 10 + + pgbench_duration = f"-T{test_duration_min * 60 * 2}" + + project = neon_api.create_project(pg_version) + project_id = project["project"]["id"] + neon_api.wait_for_operation_to_finish(project_id) + error_occurred = False + try: + branch_id = project["branch"]["id"] + master_connstr = project["connection_uris"][0]["connection_uri"] + master_env = connection_parameters_to_env( + 
project["connection_uris"][0]["connection_parameters"] + ) + + replica = neon_api.create_endpoint( + project_id, + branch_id, + endpoint_type="read_only", + settings={"pg_settings": {"hot_standby_feedback": "on"}}, + ) + replica_env = master_env.copy() + replica_env["PGHOST"] = replica["endpoint"]["host"] + neon_api.wait_for_operation_to_finish(project_id) + + replica_connstr = neon_api.get_connection_uri( + project_id, + endpoint_id=replica["endpoint"]["id"], + )["uri"] + + pg_bin.run_capture(["pgbench", "-i", "-s100"], env=master_env) + + master_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], + env=master_env, + ) + try: + replica_workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-S"], + env=replica_env, + ) + try: + start = time.time() + while time.time() - start < test_duration_min * 60: + check_pgbench_still_running(master_workload) + check_pgbench_still_running(replica_workload) + time.sleep(sync_interval_min * 60) + with psycopg2.connect(master_connstr) as conn_master, psycopg2.connect( + replica_connstr + ) as conn_replica: + with conn_master.cursor() as cur_master, conn_replica.cursor() as cur_replica: + lag = measure_replication_lag(cur_master, cur_replica) + log.info(f"Replica lagged behind master by {lag} seconds") + zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) + finally: + replica_workload.terminate() + finally: + master_workload.terminate() + except Exception as e: + error_occurred = True + log.error(f"Caught exception: {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred # Fail the test if an error occurred + neon_api.delete_project(project_id) + + +def report_pgbench_aggregate_intervals( + output_dir: Path, + prefix: str, + zenbenchmark: NeonBenchmarker, +): + for filename in os.listdir(output_dir): + if filename.startswith(prefix): + # The file will be in the form _. + # So we first lop off the ., and then lop off the prefix and the _ + node = filename.split(".")[0][len(prefix) + 1 :] + with open(output_dir / filename) as f: + reader = csv.reader(f, delimiter=" ") + for line in reader: + num_transactions = int(line[1]) + if num_transactions == 0: + continue + sum_latency = int(line[2]) + sum_lag = int(line[3]) + zenbenchmark.record( + f"{node}_num_txns", num_transactions, "txns", MetricReport.HIGHER_IS_BETTER + ) + zenbenchmark.record( + f"{node}_avg_latency", + sum_latency / num_transactions, + "s", + MetricReport.LOWER_IS_BETTER, + ) + zenbenchmark.record( + f"{node}_avg_lag", + sum_lag / num_transactions, + "s", + MetricReport.LOWER_IS_BETTER, + ) + + +@pytest.mark.remote_cluster +@pytest.mark.timeout(2 * 60 * 60) +def test_replication_start_stop( + pg_bin: PgBin, + test_output_dir: Path, + neon_api: NeonAPI, + pg_version: PgVersion, + zenbenchmark: NeonBenchmarker, +): + """ + Cycles through different configurations of read replicas being enabled disabled. The whole time, + there's a pgbench read/write workload going on the master. For each replica, we either turn it + on or off, and see how long it takes to catch up after some set amount of time of replicating + the pgbench. 
+ """ + + prefix = "pgbench_agg" + num_replicas = 2 + configuration_test_time_sec = 10 * 60 + pgbench_duration = f"-T{2 ** num_replicas * configuration_test_time_sec}" + error_occurred = False + + project = neon_api.create_project(pg_version) + project_id = project["project"]["id"] + neon_api.wait_for_operation_to_finish(project_id) + try: + branch_id = project["branch"]["id"] + master_connstr = project["connection_uris"][0]["connection_uri"] + master_env = connection_parameters_to_env( + project["connection_uris"][0]["connection_parameters"] + ) + + replicas = [] + for _ in range(num_replicas): + replicas.append( + neon_api.create_endpoint( + project_id, + branch_id, + endpoint_type="read_only", + settings={"pg_settings": {"hot_standby_feedback": "on"}}, + ) + ) + neon_api.wait_for_operation_to_finish(project_id) + + replica_connstr = [ + neon_api.get_connection_uri( + project_id, + endpoint_id=replicas[i]["endpoint"]["id"], + )["uri"] + for i in range(num_replicas) + ] + replica_env = [master_env.copy() for _ in range(num_replicas)] + for i in range(num_replicas): + replica_env[i]["PGHOST"] = replicas[i]["endpoint"]["host"] + + pg_bin.run_capture(["pgbench", "-i", "-s10"], env=master_env) + + # Sync replicas + with psycopg2.connect(master_connstr) as conn_master: + with conn_master.cursor() as cur_master: + for i in range(num_replicas): + conn_replica = psycopg2.connect(replica_connstr[i]) + measure_replication_lag(cur_master, conn_replica.cursor()) + + master_pgbench = pg_bin.run_nonblocking( + [ + "pgbench", + "-c10", + pgbench_duration, + "-Mprepared", + "--log", + f"--log-prefix={test_output_dir}/{prefix}_master", + f"--aggregate-interval={configuration_test_time_sec}", + ], + env=master_env, + ) + replica_pgbench: List[Optional[subprocess.Popen[Any]]] = [None for _ in range(num_replicas)] + + # Use the bits of iconfig to tell us which configuration we are on. For example + # a iconfig of 2 is 10 in binary, indicating replica 0 is suspended and replica 1 is + # alive. 
+ for iconfig in range((1 << num_replicas) - 1, -1, -1): + + def replica_enabled(iconfig: int = iconfig): + return bool((iconfig >> 1) & 1) + + # Change configuration + for ireplica in range(num_replicas): + if replica_enabled() and replica_pgbench[ireplica] is None: + replica_pgbench[ireplica] = pg_bin.run_nonblocking( + [ + "pgbench", + "-c10", + "-S", + pgbench_duration, + "--log", + f"--log-prefix={test_output_dir}/{prefix}_replica_{ireplica}", + f"--aggregate-interval={configuration_test_time_sec}", + ], + env=replica_env[ireplica], + ) + elif not replica_enabled() and replica_pgbench[ireplica] is not None: + pgb = replica_pgbench[ireplica] + assert pgb is not None + pgb.terminate() + pgb.wait() + replica_pgbench[ireplica] = None + + neon_api.suspend_endpoint( + project_id, + replicas[ireplica]["endpoint"]["id"], + ) + neon_api.wait_for_operation_to_finish(project_id) + + time.sleep(configuration_test_time_sec) + + with psycopg2.connect(master_connstr) as conn_master: + with conn_master.cursor() as cur_master: + for ireplica in range(num_replicas): + replica_conn = psycopg2.connect(replica_connstr[ireplica]) + lag = measure_replication_lag(cur_master, replica_conn.cursor()) + zenbenchmark.record( + f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER + ) + log.info( + f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:>b}" + ) + master_pgbench.terminate() + except Exception as e: + error_occurred = True + log.error(f"Caught exception {e}") + log.error(traceback.format_exc()) + finally: + assert not error_occurred + neon_api.delete_project(project_id) + # Only report results if we didn't error out + report_pgbench_aggregate_intervals(test_output_dir, prefix, zenbenchmark) diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index a4c8c8ac421a..3a6113706fa9 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -1,18 +1,89 @@ import concurrent.futures import random import time +from collections import defaultdict +from typing import Any, Dict import pytest from fixtures.common_types import TenantId, TenantShardId, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - NeonEnvBuilder, -) +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder from fixtures.pageserver.http import PageserverHttpClient from fixtures.pg_version import PgVersion +def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[str, int]: + """ + Get the number of shards attached to each node. + This function takes into account the intersection of the intent and the observed state. + If they do not match, it asserts out. 
+ """ + tenants = env.storage_controller.tenant_list() + + intent = dict() + observed = dict() + + tenant_placement: defaultdict[str, Dict[str, Any]] = defaultdict( + lambda: { + "observed": {"attached": None, "secondary": []}, + "intent": {"attached": None, "secondary": []}, + } + ) + + for t in tenants: + for node_id, loc_state in t["observed"]["locations"].items(): + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] + in set(["AttachedSingle", "AttachedMulti", "AttachedStale"]) + ): + observed[t["tenant_shard_id"]] = int(node_id) + tenant_placement[t["tenant_shard_id"]]["observed"]["attached"] = int(node_id) + + if ( + loc_state is not None + and "conf" in loc_state + and loc_state["conf"] is not None + and loc_state["conf"]["mode"] == "Secondary" + ): + tenant_placement[t["tenant_shard_id"]]["observed"]["secondary"].append(int(node_id)) + + if "attached" in t["intent"]: + intent[t["tenant_shard_id"]] = t["intent"]["attached"] + tenant_placement[t["tenant_shard_id"]]["intent"]["attached"] = t["intent"]["attached"] + + if "secondary" in t["intent"]: + tenant_placement[t["tenant_shard_id"]]["intent"]["secondary"] += t["intent"][ + "secondary" + ] + + log.info(f"{tenant_placement=}") + + matching = { + tid: intent[tid] for tid in observed if tid in intent and intent[tid] == observed[tid] + } + assert len(matching) == total_shards + + attached_per_node: defaultdict[str, int] = defaultdict(int) + for node_id in matching.values(): + attached_per_node[node_id] += 1 + + return attached_per_node + + +def assert_consistent_balanced_attachments(env: NeonEnv, total_shards): + attached_per_node = get_consistent_node_shard_counts(env, total_shards) + + min_shard_count = min(attached_per_node.values()) + max_shard_count = max(attached_per_node.values()) + + flake_factor = 5 / 100 + assert max_shard_count - min_shard_count <= int(total_shards * flake_factor) + + @pytest.mark.timeout(3600) # super long running test: should go down as we optimize def test_storage_controller_many_tenants( neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure @@ -44,7 +115,8 @@ def test_storage_controller_many_tenants( # A small sleep on each call into the notify hook, to simulate the latency of doing a database write compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01)) - env = neon_env_builder.init_start() + env = neon_env_builder.init_configs() + neon_env_builder.start() # We will intentionally stress reconciler concurrrency, which triggers a warning when lots # of shards are hitting the delayed path. @@ -60,14 +132,6 @@ def test_storage_controller_many_tenants( ) for ps in env.pageservers: - # This can happen because when we do a loop over all pageservers and mark them offline/active, - # reconcilers might get cancelled, and the next reconcile can follow a not-so-elegant path of - # bumping generation before other attachments are detached. - # - # We could clean this up by making reconcilers respect the .observed of their predecessor, if - # we spawn with a wait for the predecessor. - ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - # Storage controller is allowed to drop pageserver requests when the cancellation token # for a Reconciler fires. 
ps.allowed_errors.append(".*request was dropped before completing.*") @@ -79,6 +143,8 @@ def test_storage_controller_many_tenants( shard_count = 2 stripe_size = 1024 + total_shards = tenant_count * shard_count + tenants = set(TenantId.generate() for _i in range(0, tenant_count)) virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) @@ -195,10 +261,44 @@ def check_memory(): env.storage_controller.consistency_check() check_memory() - # Restart pageservers: this exercises the /re-attach API - for pageserver in env.pageservers: - pageserver.stop() - pageserver.start() + shard_counts = get_consistent_node_shard_counts(env, total_shards) + log.info(f"Shard counts before rolling restart: {shard_counts}") + + assert_consistent_balanced_attachments(env, total_shards) + + # Restart pageservers gracefully: this exercises the /re-attach pageserver API + # and the storage controller drain and fill API + for ps in env.pageservers: + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 + ) + + env.storage_controller.poll_node_status( + ps.id, "PauseForRestart", max_attempts=24, backoff=5 + ) + + shard_counts = get_consistent_node_shard_counts(env, total_shards) + log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") + # Assert that we've drained the node + assert shard_counts[str(ps.id)] == 0 + # Assert that those shards actually went somewhere + assert sum(shard_counts.values()) == total_shards + + ps.restart() + env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=24, backoff=1) + + env.storage_controller.retryable_node_operation( + lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 + ) + env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=24, backoff=5) + + shard_counts = get_consistent_node_shard_counts(env, total_shards) + log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") + + assert_consistent_balanced_attachments(env, total_shards) + + env.storage_controller.reconcile_until_idle() + env.storage_controller.consistency_check() # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, # as they were not offline long enough to trigger any scheduling changes. diff --git a/test_runner/pg_clients/java/jdbc/Dockerfile b/test_runner/pg_clients/java/jdbc/Dockerfile index 7e074e07b836..7c2b1b40e091 100644 --- a/test_runner/pg_clients/java/jdbc/Dockerfile +++ b/test_runner/pg_clients/java/jdbc/Dockerfile @@ -1,4 +1,4 @@ -FROM openjdk:21 +FROM openjdk:22 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/python/pg8000/requirements.txt b/test_runner/pg_clients/python/pg8000/requirements.txt index e086a937e68b..099a4ade2c4d 100644 --- a/test_runner/pg_clients/python/pg8000/requirements.txt +++ b/test_runner/pg_clients/python/pg8000/requirements.txt @@ -1,2 +1,2 @@ -pg8000==1.30.5 +pg8000==1.31.2 scramp>=1.4.3 diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock index a4a2426b97ec..32c1c52eea44 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "addr2line" -version = "0.21.0" +version = "0.22.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8a30b2e23b9e17a9f90641c7ab1549cd9b44f296d3ccbf309d2863cfe398a0cb" +checksum = "6e4503c46a5c0c7844e948c9a4d6acd9f50cccb4de1c48eb9e291ea17470c678" dependencies = [ "gimli", ] @@ -19,9 +19,9 @@ checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" [[package]] name = "async-trait" -version = "0.1.77" +version = "0.1.80" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c980ee35e870bd1a4d2c8294d4c04d0499e67bca1e4b5cefcc693c2fa00caea9" +checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" dependencies = [ "proc-macro2", "quote", @@ -30,15 +30,15 @@ dependencies = [ [[package]] name = "autocfg" -version = "1.1.0" +version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" [[package]] name = "backtrace" -version = "0.3.69" +version = "0.3.73" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2089b7e3f35b9dd2d0ed921ead4f6d318c27680d4a5bd167b3ee120edb105837" +checksum = "5cc23269a4f8976d0a4d2e7109211a419fe30e8d88d677cd60b6bc79c5732e0a" dependencies = [ "addr2line", "cc", @@ -63,9 +63,9 @@ checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" [[package]] name = "bitflags" -version = "2.4.2" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed570934406eb16438a4e976b1b4500774099c13b8cb96eec99f620f05090ddf" +checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de" [[package]] name = "block-buffer" @@ -78,9 +78,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.15.3" +version = "3.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8ea184aa71bb362a1157c896979544cc23974e08fd265f29ea96b59f0b4a555b" +checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" [[package]] name = "byteorder" @@ -90,15 +90,15 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.5.0" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" +checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9" [[package]] name = "cc" -version = "1.0.89" +version = "1.0.101" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a0ba8f7aaa012f30d5b2861462f6708eccd49c3c39863fe083a308035f63d723" +checksum = "ac367972e516d45567c7eafc73d24e1c193dcf200a8d94e9db7b3d38b349572d" [[package]] name = "cfg-if" @@ 
-154,9 +154,9 @@ dependencies = [ [[package]] name = "errno" -version = "0.3.8" +version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a258e46cdc063eb8519c00b9fc845fc47bcfca4130e2f08e88665ceda8474245" +checksum = "534c5cf6194dfab3db3242765c03bbe257cf92f22b38f6bc0c58d59108a820ba" dependencies = [ "libc", "windows-sys 0.52.0", @@ -170,15 +170,9 @@ checksum = "4443176a9f2c162692bd3d352d745ef9413eec5782a80d8fd6f8a1ac692a07f7" [[package]] name = "fastrand" -version = "2.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "25cbce373ec4653f1a01a31e8a5e5ec0c622dc27ff9c4e6606eefef5cbbed4a5" - -[[package]] -name = "finl_unicode" -version = "1.2.0" +version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8fcfdc7a0362c9f4444381a9e697c79d435fe65b52a37466fc2c1184cee9edc6" +checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" [[package]] name = "foreign-types" @@ -296,9 +290,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.12" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "libc", @@ -307,9 +301,9 @@ dependencies = [ [[package]] name = "gimli" -version = "0.28.1" +version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4271d37baee1b8c7e4b708028c57d816cf9d2434acb33a549475f78c181f6253" +checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" [[package]] name = "hmac" @@ -329,29 +323,23 @@ dependencies = [ "wasm-bindgen", ] -[[package]] -name = "lazy_static" -version = "1.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" - [[package]] name = "libc" -version = "0.2.153" +version = "0.2.155" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c198f91728a82281a64e1f4f9eeb25d82cb32a5de251c6bd1b5154d63a8e7bd" +checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" [[package]] name = "linux-raw-sys" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "lock_api" -version = "0.4.11" +version = "0.4.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3c168f8615b12bc01f9c17e2eb0cc07dcae1940121185446edc3744920e8ef45" +checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" dependencies = [ "autocfg", "scopeguard", @@ -375,15 +363,15 @@ dependencies = [ [[package]] name = "memchr" -version = "2.7.1" +version = "2.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "523dc4f511e55ab87b694dc30d0f820d60906ef06413f93d4d7a1385599cc149" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" [[package]] name = "miniz_oxide" -version = "0.7.2" +version = "0.7.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7" +checksum = "b8a240ddb74feaf34a79a7add65a741f3167852fba007066dcac1ca548d89c08" dependencies = [ "adler", ] @@ 
-401,11 +389,10 @@ dependencies = [ [[package]] name = "native-tls" -version = "0.2.11" +version = "0.2.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07226173c32f2926027b63cce4bcd8076c3552846cbe7925f3aaffeac0a3b92e" +checksum = "a8614eb2c83d59d1c8cc974dd3f920198647674a0a035e1af1fa58707e317466" dependencies = [ - "lazy_static", "libc", "log", "openssl", @@ -419,9 +406,9 @@ dependencies = [ [[package]] name = "object" -version = "0.32.2" +version = "0.36.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a6a622008b6e321afc04970976f62ee297fdbaa6f95318ca343e3eebb9648441" +checksum = "576dfe1fc8f9df304abb159d767a29d0476f7750fbf8aa7ad07816004a207434" dependencies = [ "memchr", ] @@ -438,7 +425,7 @@ version = "0.10.64" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95a0481286a310808298130d22dd1fef0fa571e05a8f44ec801801e84b216b1f" dependencies = [ - "bitflags 2.4.2", + "bitflags 2.6.0", "cfg-if", "foreign-types", "libc", @@ -466,9 +453,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "openssl-sys" -version = "0.9.101" +version = "0.9.102" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dda2b0f344e78efc2facf7d195d098df0dd72151b26ab98da807afc26c198dff" +checksum = "c597637d56fbc83893a35eb0dd04b2b8e7a50c91e64e9493e398b5df4fb45fa2" dependencies = [ "cc", "libc", @@ -478,9 +465,9 @@ dependencies = [ [[package]] name = "parking_lot" -version = "0.12.1" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3742b2c103b9f06bc9fff0a37ff4912935851bee6d36f3c02bcc755bcfec228f" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" dependencies = [ "lock_api", "parking_lot_core", @@ -488,15 +475,15 @@ dependencies = [ [[package]] name = "parking_lot_core" -version = "0.9.9" +version = "0.9.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c42a9226546d68acdd9c0a280d17ce19bfe27a46bf68784e4066115788d008e" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" dependencies = [ "cfg-if", "libc", - "redox_syscall", + "redox_syscall 0.5.2", "smallvec", - "windows-targets 0.48.5", + "windows-targets 0.52.5", ] [[package]] @@ -525,9 +512,9 @@ dependencies = [ [[package]] name = "pin-project-lite" -version = "0.2.13" +version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8afb450f006bf6385ca15ef45d71d2288452bc3683ce2e2cacc0d18e4be60b58" +checksum = "bda66fc9667c18cb2758a2ac84d1167245054bcf85d5d1aaa6923f45801bdd02" [[package]] name = "pin-utils" @@ -591,18 +578,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" [[package]] name = "proc-macro2" -version = "1.0.78" +version = "1.0.86" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" +checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.35" +version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "291ec9ab5efd934aaf503a6466c5d5251535d108ee747472c3977cc5acc868ef" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] @@ -646,6 +633,15 @@ dependencies = [ "bitflags 1.3.2", ] +[[package]] +name = "redox_syscall" 
+version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c82cf8cff14456045f55ec4241383baeff27af886adb72ffb2162f99911de0fd" +dependencies = [ + "bitflags 2.6.0", +] + [[package]] name = "rust-neon-example" version = "0.1.0" @@ -658,17 +654,17 @@ dependencies = [ [[package]] name = "rustc-demangle" -version = "0.1.23" +version = "0.1.24" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustix" -version = "0.38.31" +version = "0.38.34" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ea3e1a662af26cd7a3ba09c0297a31af215563ecf42817c98df621387f4e949" +checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" dependencies = [ - "bitflags 2.4.2", + "bitflags 2.6.0", "errno", "libc", "linux-raw-sys", @@ -692,11 +688,11 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "security-framework" -version = "2.9.2" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05b64fb303737d99b81884b2c63433e9ae28abebe5eb5045dcdd175dc2ecf4de" +checksum = "c627723fd09706bacdb5cf41499e95098555af3c3c29d014dc3c458ef6be11c0" dependencies = [ - "bitflags 1.3.2", + "bitflags 2.6.0", "core-foundation", "core-foundation-sys", "libc", @@ -705,9 +701,9 @@ dependencies = [ [[package]] name = "security-framework-sys" -version = "2.9.1" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e932934257d3b408ed8f30db49d85ea163bfe74961f017f405b025af298f0c7a" +checksum = "317936bbbd05227752583946b9e66d7ce3b489f84e11a94a510b4437fef407d7" dependencies = [ "core-foundation-sys", "libc", @@ -741,15 +737,15 @@ dependencies = [ [[package]] name = "smallvec" -version = "1.13.1" +version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" [[package]] name = "socket2" -version = "0.5.6" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05ffd9c0a93b7543e062e759284fcf5f5e3b098501104bfbdde4d404db792871" +checksum = "ce305eb0b4296696835b71df73eb912e0f1ffd2556a501fcede6e0c50349191c" dependencies = [ "libc", "windows-sys 0.52.0", @@ -757,26 +753,26 @@ dependencies = [ [[package]] name = "stringprep" -version = "0.1.4" +version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bb41d74e231a107a1b4ee36bd1214b11285b77768d2e3824aedafa988fd36ee6" +checksum = "7b4df3d392d81bd458a8a621b8bffbd2302a12ffe288a9d931670948749463b1" dependencies = [ - "finl_unicode", "unicode-bidi", "unicode-normalization", + "unicode-properties", ] [[package]] name = "subtle" -version = "2.5.0" +version = "2.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81cdd64d312baedb58e21336b31bc043b77e01cc99033ce76ef539f78e965ebc" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" [[package]] name = "syn" -version = "2.0.52" +version = "2.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" +checksum = 
"901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9" dependencies = [ "proc-macro2", "quote", @@ -797,9 +793,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.6.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87cc5ceb3875bb20c2890005a4e226a4651264a5c75edb2421b52861a0a0cb50" +checksum = "c55115c6fbe2d2bef26eb09ad74bde02d8255476fc0c7b515ef09fbb35742d82" dependencies = [ "tinyvec_macros", ] @@ -812,9 +808,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.36.0" +version = "1.38.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61285f6515fa018fb2d1e46eb21223fff441ee8db5d0f1435e8ab4f5cdb80931" +checksum = "ba4f4a02a7a80d6f274636f0aa95c7e383b912d41fe721a31f29e29698585a4a" dependencies = [ "backtrace", "bytes", @@ -828,9 +824,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.2.0" +version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b8a1e28f2deaa14e508979454cb3a223b10b938b45af148bc0986de36f1923b" +checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" dependencies = [ "proc-macro2", "quote", @@ -875,35 +871,15 @@ dependencies = [ [[package]] name = "tokio-util" -version = "0.7.10" +version = "0.7.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5419f34732d9eb6ee4c3578b7989078579b7f039cbbb9ca2c4da015749371e15" +checksum = "9cf6b47b3771c49ac75ad09a6162f53ad4b8088b76ac60e8ec1455b31a189fe1" dependencies = [ "bytes", "futures-core", "futures-sink", "pin-project-lite", "tokio", - "tracing", -] - -[[package]] -name = "tracing" -version = "0.1.40" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" -dependencies = [ - "pin-project-lite", - "tracing-core", -] - -[[package]] -name = "tracing-core" -version = "0.1.32" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" -dependencies = [ - "once_cell", ] [[package]] @@ -933,6 +909,12 @@ dependencies = [ "tinyvec", ] +[[package]] +name = "unicode-properties" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4259d9d4425d9f0661581b804cb85fe66a4c631cadd8f490d1c13a35d5d9291" + [[package]] name = "vcpkg" version = "0.2.15" @@ -1023,11 +1005,11 @@ dependencies = [ [[package]] name = "whoami" -version = "1.5.0" +version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fec781d48b41f8163426ed18e8fc2864c12937df9ce54c88ede7bd47270893e" +checksum = "a44ab49fad634e88f55bf8f9bb3abd2f27d7204172a112c7c9987e01c1c94ea9" dependencies = [ - "redox_syscall", + "redox_syscall 0.4.1", "wasite", "web-sys", ] @@ -1047,7 +1029,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.4", + "windows-targets 0.52.5", ] [[package]] @@ -1067,17 +1049,18 @@ dependencies = [ [[package]] name = "windows-targets" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7dd37b7e5ab9018759f893a1952c9420d060016fc19a472b4bb20d1bdd694d1b" +checksum = "6f0713a46559409d202e70e28227288446bf7841d3211583a4b53e3f6d96e7eb" 
dependencies = [ - "windows_aarch64_gnullvm 0.52.4", - "windows_aarch64_msvc 0.52.4", - "windows_i686_gnu 0.52.4", - "windows_i686_msvc 0.52.4", - "windows_x86_64_gnu 0.52.4", - "windows_x86_64_gnullvm 0.52.4", - "windows_x86_64_msvc 0.52.4", + "windows_aarch64_gnullvm 0.52.5", + "windows_aarch64_msvc 0.52.5", + "windows_i686_gnu 0.52.5", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.5", + "windows_x86_64_gnu 0.52.5", + "windows_x86_64_gnullvm 0.52.5", + "windows_x86_64_msvc 0.52.5", ] [[package]] @@ -1088,9 +1071,9 @@ checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" [[package]] name = "windows_aarch64_gnullvm" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcf46cf4c365c6f2d1cc93ce535f2c8b244591df96ceee75d8e83deb70a9cac9" +checksum = "7088eed71e8b8dda258ecc8bac5fb1153c5cffaf2578fc8ff5d61e23578d3263" [[package]] name = "windows_aarch64_msvc" @@ -1100,9 +1083,9 @@ checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" [[package]] name = "windows_aarch64_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "da9f259dd3bcf6990b55bffd094c4f7235817ba4ceebde8e6d11cd0c5633b675" +checksum = "9985fd1504e250c615ca5f281c3f7a6da76213ebd5ccc9561496568a2752afb6" [[package]] name = "windows_i686_gnu" @@ -1112,9 +1095,15 @@ checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" [[package]] name = "windows_i686_gnu" -version = "0.52.4" +version = "0.52.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88ba073cf16d5372720ec942a8ccbf61626074c6d4dd2e745299726ce8b89670" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b474d8268f99e0995f25b9f095bc7434632601028cf86590aea5c8a5cb7801d3" +checksum = "87f4261229030a858f36b459e748ae97545d6f1ec60e5e0d6a3d32e0dc232ee9" [[package]] name = "windows_i686_msvc" @@ -1124,9 +1113,9 @@ checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" [[package]] name = "windows_i686_msvc" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1515e9a29e5bed743cb4415a9ecf5dfca648ce85ee42e15873c3cd8610ff8e02" +checksum = "db3c2bf3d13d5b658be73463284eaf12830ac9a26a90c717b7f771dfe97487bf" [[package]] name = "windows_x86_64_gnu" @@ -1136,9 +1125,9 @@ checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" [[package]] name = "windows_x86_64_gnu" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5eee091590e89cc02ad514ffe3ead9eb6b660aedca2183455434b93546371a03" +checksum = "4e4246f76bdeff09eb48875a0fd3e2af6aada79d409d33011886d3e1581517d9" [[package]] name = "windows_x86_64_gnullvm" @@ -1148,9 +1137,9 @@ checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" [[package]] name = "windows_x86_64_gnullvm" -version = "0.52.4" +version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77ca79f2451b49fa9e2af39f0747fe999fcda4f5e241b2898624dca97a1f2177" +checksum = "852298e482cd67c356ddd9570386e2862b5673c85bd5f88df9ab6802b334c596" [[package]] name = "windows_x86_64_msvc" @@ -1160,6 +1149,6 @@ checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" [[package]] name = "windows_x86_64_msvc" -version = "0.52.4" 
+version = "0.52.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32b752e52a2da0ddfbdbcc6fceadfeede4c939ed16d13e648833a61dfb611ed8" +checksum = "bec47e5bfd1bff0eeaf6d8b485cc1074891a197ab4225d504cb7a1ab88b02bf0" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml index 0f420e5b0643..27d01810bd52 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml +++ b/test_runner/pg_clients/rust/tokio-postgres/Cargo.toml @@ -7,9 +7,9 @@ publish = false # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -native-tls = "0.2.11" +native-tls = "0.2.12" postgres-native-tls = "0.5.0" -tokio = { version = "1.36", features=["rt", "macros"] } +tokio = { version = "1.38", features=["rt", "macros"] } tokio-postgres = "0.7.10" diff --git a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile index 8611e66cbb67..3e214de785b3 100644 --- a/test_runner/pg_clients/rust/tokio-postgres/Dockerfile +++ b/test_runner/pg_clients/rust/tokio-postgres/Dockerfile @@ -1,4 +1,4 @@ -FROM rust:1.76 +FROM rust:1.79 WORKDIR /source COPY . . diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile index 04028388207c..6006e61ee22e 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Dockerfile @@ -1,11 +1,11 @@ -FROM swift:5.9 AS build +FROM swift:5.10 AS build RUN apt-get -q update && apt-get -q install -y libssl-dev WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.9 +FROM swift:5.10 WORKDIR /app COPY --from=build /source/.build/release . 
CMD ["/app/PostgresClientKitExample"] diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved index 767443a9ddcc..6e8613095f15 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.resolved @@ -1,4 +1,5 @@ { + "originHash" : "8eff8c577ba246ce7824d3434839acefced2b1a1d2b1ad700554502538a50558", "pins" : [ { "identity" : "bluesocket", @@ -18,15 +19,6 @@ "version" : "2.0.2" } }, - { - "identity" : "openssl", - "kind" : "remoteSourceControl", - "location" : "https://github.com/Kitura/OpenSSL.git", - "state" : { - "revision" : "5dc8cb4f971135c17343e3c6df4f28904a0600e2", - "version" : "2.3.1" - } - }, { "identity" : "postgresclientkit", "kind" : "remoteSourceControl", @@ -37,5 +29,5 @@ } } ], - "version" : 2 + "version" : 3 } diff --git a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift index 48320dd02314..a66d09c542f9 100644 --- a/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresClientKitExample/Package.swift @@ -1,4 +1,4 @@ -// swift-tools-version:5.8 +// swift-tools-version:5.10 import PackageDescription let package = Package( diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile index 9130e0973f8e..d6815fbb5fa2 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Dockerfile @@ -1,10 +1,10 @@ -FROM swift:5.9 AS build +FROM swift:5.10 AS build WORKDIR /source COPY . . RUN swift build --configuration release -FROM swift:5.9 +FROM swift:5.10 WORKDIR /app COPY --from=build /source/.build/release . 
CMD ["/app/PostgresNIOExample"] diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved index 023e03a7b1a0..0e5dfdafcb0a 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.resolved @@ -1,12 +1,22 @@ { + "originHash" : "11b5dcece349a3e56a7a9a7d0af6d0f5b83dff321b43124a01b158ed7aac5302", "pins" : [ { "identity" : "postgres-nio", "kind" : "remoteSourceControl", "location" : "https://github.com/vapor/postgres-nio.git", "state" : { - "revision" : "69ccfdf4c80144d845e3b439961b7ec6cd7ae33f", - "version" : "1.20.2" + "revision" : "5c268768890b062803a49f1358becc478f954265", + "version" : "1.21.5" + } + }, + { + "identity" : "swift-async-algorithms", + "kind" : "remoteSourceControl", + "location" : "https://github.com/apple/swift-async-algorithms.git", + "state" : { + "revision" : "da4e36f86544cdf733a40d59b3a2267e3a7bbf36", + "version" : "1.0.0" } }, { @@ -81,6 +91,15 @@ "version" : "1.20.1" } }, + { + "identity" : "swift-service-lifecycle", + "kind" : "remoteSourceControl", + "location" : "https://github.com/swift-server/swift-service-lifecycle.git", + "state" : { + "revision" : "d58e6bf2b1ae2884cf204a8b5bcaaa7aae3c1ff0", + "version" : "2.6.0" + } + }, { "identity" : "swift-system", "kind" : "remoteSourceControl", @@ -91,5 +110,5 @@ } } ], - "version" : 2 + "version" : 3 } diff --git a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift index 637eb4bc9ddb..20bb10f76c37 100644 --- a/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift +++ b/test_runner/pg_clients/swift/PostgresNIOExample/Package.swift @@ -1,10 +1,10 @@ -// swift-tools-version:5.9 +// swift-tools-version:5.10 import PackageDescription let package = Package( name: "PostgresNIOExample", dependencies: [ - .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.20.2") + .package(url: "https://github.com/vapor/postgres-nio.git", from: "1.21.5") ], targets: [ .executableTarget( diff --git a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile index 004b383749f9..45e8753f7eec 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/Dockerfile +++ b/test_runner/pg_clients/typescript/postgresql-client/Dockerfile @@ -1,4 +1,4 @@ -FROM node:21 +FROM node:22 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json index b4f8587eacef..19311808b6b8 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package-lock.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package-lock.json @@ -5,7 +5,7 @@ "packages": { "": { "dependencies": { - "postgresql-client": "2.10.5" + "postgresql-client": "2.11.0" } }, "node_modules/doublylinked": { @@ -42,9 +42,10 @@ } }, "node_modules/postgresql-client": { - "version": "2.10.5", - "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.10.5.tgz", - "integrity": "sha512-R3EC16pUdbgrzk1J2MQLj7jY2TepWurJHoK90nOeLZj1XTpL/+wL1VCneTmclRVKDuKVjFHr+FASV47KrLpAbw==", + "version": "2.11.0", + "resolved": "https://registry.npmjs.org/postgresql-client/-/postgresql-client-2.11.0.tgz", + "integrity": "sha512-QSPHcWVaiBG+JyASaDojOXvhRmsc2n8j2COdIjUDENFAtFls16Zy240asY2ENzZRQJUMAA8vpR8w4SAdI8jdbw==", + "license": "MIT", "dependencies": { "doublylinked": "^2.5.4", "lightning-pool": "^4.2.2", @@ -55,8 +56,7 @@ "putil-varhelpers": "^1.6.5" }, "engines": { - "node": ">=16.0", - "npm": ">=7.0.0" + "node": ">=16.0" } }, "node_modules/power-tasks": { diff --git a/test_runner/pg_clients/typescript/postgresql-client/package.json b/test_runner/pg_clients/typescript/postgresql-client/package.json index 07ec100d0d22..d2bba23d2912 100644 --- a/test_runner/pg_clients/typescript/postgresql-client/package.json +++ b/test_runner/pg_clients/typescript/postgresql-client/package.json @@ -1,6 +1,6 @@ { "type": "module", "dependencies": { - "postgresql-client": "2.10.5" + "postgresql-client": "2.11.0" } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile index 004b383749f9..45e8753f7eec 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/Dockerfile +++ b/test_runner/pg_clients/typescript/serverless-driver/Dockerfile @@ -1,4 +1,4 @@ -FROM node:21 +FROM node:22 WORKDIR /source COPY . . 
diff --git a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json index f3b456f1edc7..7f3f7f2e84e7 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package-lock.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package-lock.json @@ -5,96 +5,138 @@ "packages": { "": { "dependencies": { - "@neondatabase/serverless": "0.9.0", + "@neondatabase/serverless": "0.9.4", "ws": "8.17.1" } }, "node_modules/@neondatabase/serverless": { - "version": "0.9.0", - "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.9.0.tgz", - "integrity": "sha512-mmJnUAzlzvxNSZuuhI6kgJjH+JgFdBMYUWxihtq/nj0Tjt+Y5UU3W+SvRFoucnd5NObYkuLYQzk+zV5DGFKGJg==", + "version": "0.9.4", + "resolved": "https://registry.npmjs.org/@neondatabase/serverless/-/serverless-0.9.4.tgz", + "integrity": "sha512-D0AXgJh6xkf+XTlsO7iwE2Q1w8981E1cLCPAALMU2YKtkF/1SF6BiAzYARZFYo175ON+b1RNIy9TdSFHm5nteg==", + "license": "MIT", "dependencies": { - "@types/pg": "8.6.6" + "@types/pg": "8.11.6" } }, "node_modules/@types/node": { - "version": "18.16.3", - "resolved": "https://registry.npmjs.org/@types/node/-/node-18.16.3.tgz", - "integrity": "sha512-OPs5WnnT1xkCBiuQrZA4+YAV4HEJejmHneyraIaxsbev5yCEr6KMwINNFP9wQeFIw8FWcoTqF3vQsa5CDaI+8Q==" + "version": "20.14.9", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.9.tgz", + "integrity": "sha512-06OCtnTXtWOZBJlRApleWndH4JsRVs1pDCc8dLSQp+7PpUpX3ePdHyeNSFTeSe7FtKyQkrlPvHwJOW3SLd8Oyg==", + "license": "MIT", + "dependencies": { + "undici-types": "~5.26.4" + } }, "node_modules/@types/pg": { - "version": "8.6.6", - "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.6.6.tgz", - "integrity": "sha512-O2xNmXebtwVekJDD+02udOncjVcMZQuTEQEMpKJ0ZRf5E7/9JJX3izhKUcUifBkyKpljyUM6BTgy2trmviKlpw==", + "version": "8.11.6", + "resolved": "https://registry.npmjs.org/@types/pg/-/pg-8.11.6.tgz", + "integrity": "sha512-/2WmmBXHLsfRqzfHW7BNZ8SbYzE8OSk7i3WjFYvfgRHj7S1xj+16Je5fUKv3lVdVzk/zn9TXOqf+avFCFIE0yQ==", + "license": "MIT", "dependencies": { "@types/node": "*", "pg-protocol": "*", - "pg-types": "^2.2.0" + "pg-types": "^4.0.1" } }, + "node_modules/obuf": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/obuf/-/obuf-1.1.2.tgz", + "integrity": "sha512-PX1wu0AmAdPqOL1mWhqmlOd8kOIZQwGZw6rh7uby9fTc5lhaOWFLX3I6R1hrF9k3zUY40e6igsLGkDXK92LJNg==", + "license": "MIT" + }, "node_modules/pg-int8": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/pg-int8/-/pg-int8-1.0.1.tgz", "integrity": "sha512-WCtabS6t3c8SkpDBUlb1kjOs7l66xsGdKpIPZsg4wR+B3+u9UAum2odSsF9tnvxg80h4ZxLWMy4pRjOsFIqQpw==", + "license": "ISC", "engines": { "node": ">=4.0.0" } }, + "node_modules/pg-numeric": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/pg-numeric/-/pg-numeric-1.0.2.tgz", + "integrity": "sha512-BM/Thnrw5jm2kKLE5uJkXqqExRUY/toLHda65XgFTBTFYZyopbKjBe29Ii3RbkvlsMoFwD+tHeGaCjjv0gHlyw==", + "license": "ISC", + "engines": { + "node": ">=4" + } + }, "node_modules/pg-protocol": { - "version": "1.6.0", - "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.6.0.tgz", - "integrity": "sha512-M+PDm637OY5WM307051+bsDia5Xej6d9IR4GwJse1qA1DIhiKlksvrneZOYQq42OM+spubpcNYEo2FcKQrDk+Q==" + "version": "1.6.1", + "resolved": "https://registry.npmjs.org/pg-protocol/-/pg-protocol-1.6.1.tgz", + "integrity": "sha512-jPIlvgoD63hrEuihvIg+tJhoGjUsLPn6poJY9N5CnlPd91c2T18T/9zBtLxZSb1EhYxBRoZJtzScCaWlYLtktg==", + "license": "MIT" }, "node_modules/pg-types": { - "version": 
"2.2.0", - "resolved": "https://registry.npmjs.org/pg-types/-/pg-types-2.2.0.tgz", - "integrity": "sha512-qTAAlrEsl8s4OiEQY69wDvcMIdQN6wdz5ojQiOy6YRMuynxenON0O5oCpJI6lshc6scgAY8qvJ2On/p+CXY0GA==", + "version": "4.0.2", + "resolved": "https://registry.npmjs.org/pg-types/-/pg-types-4.0.2.tgz", + "integrity": "sha512-cRL3JpS3lKMGsKaWndugWQoLOCoP+Cic8oseVcbr0qhPzYD5DWXK+RZ9LY9wxRf7RQia4SCwQlXk0q6FCPrVng==", + "license": "MIT", "dependencies": { "pg-int8": "1.0.1", - "postgres-array": "~2.0.0", - "postgres-bytea": "~1.0.0", - "postgres-date": "~1.0.4", - "postgres-interval": "^1.1.0" + "pg-numeric": "1.0.2", + "postgres-array": "~3.0.1", + "postgres-bytea": "~3.0.0", + "postgres-date": "~2.1.0", + "postgres-interval": "^3.0.0", + "postgres-range": "^1.1.1" }, "engines": { - "node": ">=4" + "node": ">=10" } }, "node_modules/postgres-array": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/postgres-array/-/postgres-array-2.0.0.tgz", - "integrity": "sha512-VpZrUqU5A69eQyW2c5CA1jtLecCsN2U/bD6VilrFDWq5+5UIEVO7nazS3TEcHf1zuPYO/sqGvUvW62g86RXZuA==", + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/postgres-array/-/postgres-array-3.0.2.tgz", + "integrity": "sha512-6faShkdFugNQCLwucjPcY5ARoW1SlbnrZjmGl0IrrqewpvxvhSLHimCVzqeuULCbG0fQv7Dtk1yDbG3xv7Veog==", + "license": "MIT", "engines": { - "node": ">=4" + "node": ">=12" } }, "node_modules/postgres-bytea": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-1.0.0.tgz", - "integrity": "sha512-xy3pmLuQqRBZBXDULy7KbaitYqLcmxigw14Q5sj8QBVLqEwXfeybIKVWiqAXTlcvdvb0+xkOtDbfQMOf4lST1w==", + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/postgres-bytea/-/postgres-bytea-3.0.0.tgz", + "integrity": "sha512-CNd4jim9RFPkObHSjVHlVrxoVQXz7quwNFpz7RY1okNNme49+sVyiTvTRobiLV548Hx/hb1BG+iE7h9493WzFw==", + "license": "MIT", + "dependencies": { + "obuf": "~1.1.2" + }, "engines": { - "node": ">=0.10.0" + "node": ">= 6" } }, "node_modules/postgres-date": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/postgres-date/-/postgres-date-1.0.7.tgz", - "integrity": "sha512-suDmjLVQg78nMK2UZ454hAG+OAW+HQPZ6n++TNDUX+L0+uUlLywnoxJKDou51Zm+zTCjrCl0Nq6J9C5hP9vK/Q==", + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/postgres-date/-/postgres-date-2.1.0.tgz", + "integrity": "sha512-K7Juri8gtgXVcDfZttFKVmhglp7epKb1K4pgrkLxehjqkrgPhfG6OO8LHLkfaqkbpjNRnra018XwAr1yQFWGcA==", + "license": "MIT", "engines": { - "node": ">=0.10.0" + "node": ">=12" } }, "node_modules/postgres-interval": { - "version": "1.2.0", - "resolved": "https://registry.npmjs.org/postgres-interval/-/postgres-interval-1.2.0.tgz", - "integrity": "sha512-9ZhXKM/rw350N1ovuWHbGxnGh/SNJ4cnxHiM0rxE4VN41wsg8P8zWn9hv/buK00RP4WvlOyr/RBDiptyxVbkZQ==", - "dependencies": { - "xtend": "^4.0.0" - }, + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/postgres-interval/-/postgres-interval-3.0.0.tgz", + "integrity": "sha512-BSNDnbyZCXSxgA+1f5UU2GmwhoI0aU5yMxRGO8CdFEcY2BQF9xm/7MqKnYoM1nJDk8nONNWDk9WeSmePFhQdlw==", + "license": "MIT", "engines": { - "node": ">=0.10.0" + "node": ">=12" } }, + "node_modules/postgres-range": { + "version": "1.1.4", + "resolved": "https://registry.npmjs.org/postgres-range/-/postgres-range-1.1.4.tgz", + "integrity": "sha512-i/hbxIE9803Alj/6ytL7UHQxRvZkI9O4Sy+J3HGc4F4oo/2eQAjTSNJ0bfxyse3bH0nuVesCk+3IRLaMtG3H6w==", + "license": "MIT" + }, + "node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + 
"integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", + "license": "MIT" + }, "node_modules/ws": { "version": "8.17.1", "resolved": "https://registry.npmjs.org/ws/-/ws-8.17.1.tgz", @@ -114,14 +156,6 @@ "optional": true } } - }, - "node_modules/xtend": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", - "integrity": "sha512-LKYU1iAXJXUgAXn9URjiu+MWhyUXHsvfp7mcuYm9dSUKK0/CjtrUwFAxD82/mCWbtLsGjFIad0wIsod4zrTAEQ==", - "engines": { - "node": ">=0.4" - } } } } diff --git a/test_runner/pg_clients/typescript/serverless-driver/package.json b/test_runner/pg_clients/typescript/serverless-driver/package.json index 3ae7a8a6cfcd..f791d184c5f8 100644 --- a/test_runner/pg_clients/typescript/serverless-driver/package.json +++ b/test_runner/pg_clients/typescript/serverless-driver/package.json @@ -1,7 +1,7 @@ { "type": "module", "dependencies": { - "@neondatabase/serverless": "0.9.0", + "@neondatabase/serverless": "0.9.4", "ws": "8.17.1" } } diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index e117c2140f5e..f2ee2b70aac6 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -21,8 +21,6 @@ def positive_env(neon_env_builder: NeonEnvBuilder) -> NeonEnv: [ # eviction might be the first one after an attach to access the layers ".*unexpectedly on-demand downloading remote layer .* for task kind Eviction", - # detach can happen before we get to validate the generation number - ".*deletion backend: Dropped remote consistent LSN updates for tenant.*", ] ) assert isinstance(env.pageserver_remote_storage, LocalFsStorage) @@ -58,10 +56,6 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N env.pageserver.allowed_errors.extend( [ - # This fixture detaches the tenant, and tests using it will tend to re-attach it - # shortly after. There may be un-processed deletion_queue validations from the - # initial attachment - ".*Dropped remote consistent LSN updates.*", # This fixture is for tests that will intentionally generate 400 responses ".*Error processing HTTP request: Bad request", ] diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 035ab2796f6a..922a21a99929 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -211,7 +211,7 @@ def op(): def check_pageserver(expect_success: bool, **conn_kwargs): check_connection( env.pageserver, - f"get_last_record_rlsn {env.initial_tenant} {timeline_id}", + f"show {env.initial_tenant}", expect_success, **conn_kwargs, ) diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index 97ab69049d00..4d2cdb8e320a 100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -14,11 +14,6 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder): ) env = neon_env_builder.init_start() - for pageserver in env.pageservers: - # This test dual-attaches a tenant, one of the pageservers will therefore - # be running with a stale generation. 
- pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - env.neon_cli.create_branch("test_change_pageserver") endpoint = env.endpoints.create_start("test_change_pageserver") diff --git a/test_runner/regress/test_endpoint_crash.py b/test_runner/regress/test_endpoint_crash.py new file mode 100644 index 000000000000..ae3dded437a0 --- /dev/null +++ b/test_runner/regress/test_endpoint_crash.py @@ -0,0 +1,23 @@ +import pytest +from fixtures.neon_fixtures import NeonEnvBuilder + + +@pytest.mark.parametrize( + "sql_func", + [ + "trigger_panic", + "trigger_segfault", + "💣", # calls `trigger_segfault` internally + ], +) +def test_endpoint_crash(neon_env_builder: NeonEnvBuilder, sql_func: str): + """ + Test that triggering crash from neon_test_utils crashes the endpoint + """ + env = neon_env_builder.init_start() + env.neon_cli.create_branch("test_endpoint_crash") + endpoint = env.endpoints.create_start("test_endpoint_crash") + + endpoint.safe_psql("CREATE EXTENSION neon_test_utils;") + with pytest.raises(Exception, match="This probably means the server terminated abnormally"): + endpoint.safe_psql(f"SELECT {sql_func}();") diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index d97e882a7093..4dae9176b83f 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -88,7 +88,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build env.pageserver.allowed_errors.extend( [ - ".*error importing base backup .*", + ".*Failed to import basebackup.*", + ".*unexpected non-zero bytes after the tar archive.*", ".*Timeline got dropped without initializing, cleaning its files.*", ".*InternalServerError.*timeline not found.*", ".*InternalServerError.*Tenant .* not found.*", diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 54d3b2d515c5..3b2218dd9b09 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -39,9 +39,6 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_configs() env.start() - env.pageserver.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) ps_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_lfc_working_set_approximation.py b/test_runner/regress/test_lfc_working_set_approximation.py index a6f05fe0f712..6465bdfd217d 100644 --- a/test_runner/regress/test_lfc_working_set_approximation.py +++ b/test_runner/regress/test_lfc_working_set_approximation.py @@ -1,3 +1,4 @@ +import time from pathlib import Path from fixtures.log_helper import log @@ -72,3 +73,46 @@ def test_lfc_working_set_approximation(neon_simple_env: NeonEnv): blocks = query_scalar(cur, "select approximate_working_set_size(true)") log.info(f"working set size after some index access of a few select pages only {blocks}") assert blocks < 10 + + +def test_sliding_working_set_approximation(neon_simple_env: NeonEnv): + env = neon_simple_env + + endpoint = env.endpoints.create_start( + branch_name="main", + config_lines=[ + "autovacuum = off", + "shared_buffers=1MB", + "neon.max_file_cache_size=256MB", + "neon.file_cache_size_limit=245MB", + ], + ) + conn = endpoint.connect() + cur = conn.cursor() + cur.execute("create extension neon version '1.4'") + cur.execute( + "create table t(pk integer primary key, count integer default 0, payload text default repeat('?', 128))" + ) + cur.execute("insert 
into t (pk) values (generate_series(1,1000000))") + time.sleep(2) + before_10k = time.monotonic() + cur.execute("select sum(count) from t where pk between 10000 and 20000") + time.sleep(2) + before_1k = time.monotonic() + cur.execute("select sum(count) from t where pk between 1000 and 2000") + after = time.monotonic() + + cur.execute(f"select approximate_working_set_size_seconds({int(after - before_1k + 1)})") + estimation_1k = cur.fetchall()[0][0] + log.info(f"Working set size for selecting 1k records {estimation_1k}") + + cur.execute(f"select approximate_working_set_size_seconds({int(after - before_10k + 1)})") + estimation_10k = cur.fetchall()[0][0] + log.info(f"Working set size for selecting 10k records {estimation_10k}") + + cur.execute("select pg_table_size('t')") + size = cur.fetchall()[0][0] // 8192 + log.info(f"Table size {size} blocks") + + assert estimation_1k >= 20 and estimation_1k <= 40 + assert estimation_10k >= 200 and estimation_10k <= 400 diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index ca3c81d6e51d..41283e4d2ca0 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -4,7 +4,6 @@ from string import ascii_lowercase import pytest -from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( AuxFileStore, @@ -13,7 +12,7 @@ logical_replication_sync, wait_for_last_flush_lsn, ) -from fixtures.utils import query_scalar, wait_until +from fixtures.utils import wait_until def random_string(n: int): @@ -326,12 +325,17 @@ def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg): assert "could not receive data from WAL stream" not in logs -# Test compute start at LSN page of which starts with contrecord -# https://github.com/neondatabase/neon/issues/5749 +# Test replication of WAL record spanning page boundary (with contrecord) after +# compute restart and WAL write of the page. +# +# See https://github.com/neondatabase/neon/issues/5749 +# +# Most pages start with a contrecord, so we don't do anything special +# to ensure that. @pytest.mark.parametrize( "pageserver_aux_file_policy", [AuxFileStore.V1, AuxFileStore.CrossValidation] ) -def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): +def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env env.neon_cli.create_branch("init") @@ -356,52 +360,6 @@ def test_wal_page_boundary_start(neon_simple_env: NeonEnv, vanilla_pg): logical_replication_sync(vanilla_pg, endpoint) vanilla_pg.stop() - with endpoint.cursor() as cur: - # measure how much space logical message takes. Sometimes first attempt - # creates huge message and then it stabilizes, have no idea why. - for _ in range(3): - lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - log.info(f"current_lsn={lsn_before}") - # Non-transactional logical message doesn't write WAL, only XLogInsert's - # it, so use transactional. Which is a bit problematic as transactional - # necessitates commit record. Alternatively we can do smth like - # select neon_xlogflush(pg_current_wal_insert_lsn()); - # but isn't much better + that particular call complains on 'xlog flush - # request 0/282C018 is not satisfied' as pg_current_wal_insert_lsn skips - # page headers. 
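The 20-40 and 200-400 block bounds asserted in test_sliding_working_set_approximation above follow from simple row-width arithmetic; a rough sketch, assuming each row of t occupies about 170 bytes on disk (4-byte pk, 4-byte count, 128-byte payload, plus tuple header and alignment):

    rows_per_page = 8192 // 170          # roughly 48 rows fit on one 8 KB heap page
    pages_1k = 1001 // rows_per_page     # ~20 pages touched by the 1k-row range scan  -> bound 20..40
    pages_10k = 10001 // rows_per_page   # ~208 pages touched by the 10k-row range scan -> bound 200..400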
- payload = "blahblah" - cur.execute(f"select pg_logical_emit_message(true, 'pref', '{payload}')") - lsn_after_by_curr_wal_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - lsn_diff = lsn_after_by_curr_wal_lsn - lsn_before - logical_message_base = lsn_after_by_curr_wal_lsn - lsn_before - len(payload) - log.info( - f"before {lsn_before}, after {lsn_after_by_curr_wal_lsn}, lsn diff is {lsn_diff}, base {logical_message_base}" - ) - - # and write logical message spanning exactly as we want - lsn_before = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - log.info(f"current_lsn={lsn_before}") - curr_lsn = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - offs = int(curr_lsn) % 8192 - till_page = 8192 - offs - payload_len = ( - till_page - logical_message_base - 8 - ) # not sure why 8 is here, it is deduced from experiments - log.info(f"current_lsn={curr_lsn}, offs {offs}, till_page {till_page}") - - # payload_len above would go exactly till the page boundary; but we want contrecord, so make it slightly longer - payload_len += 8 - - cur.execute(f"select pg_logical_emit_message(true, 'pref', 'f{'a' * payload_len}')") - supposedly_contrecord_end = Lsn(query_scalar(cur, "select pg_current_wal_lsn()")) - log.info(f"supposedly_page_boundary={supposedly_contrecord_end}") - # The calculations to hit the page boundary are very fuzzy, so just - # ignore test if we fail to reach it. - if not (int(supposedly_contrecord_end) % 8192 == 32): - pytest.skip("missed page boundary, bad luck") - - cur.execute("insert into replication_example values (2, 3)") - wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) endpoint.stop().start() diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index 39b486502672..e83aaf91c60f 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -50,7 +50,7 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): # Ensure that the default version is also updated in the neon.control file assert cur.fetchone() == ("1.3",) cur.execute("SELECT * from neon.NEON_STAT_FILE_CACHE") - all_versions = ["1.3", "1.2", "1.1", "1.0"] + all_versions = ["1.4", "1.3", "1.2", "1.1", "1.0"] current_version = "1.3" for idx, begin_version in enumerate(all_versions): for target_version in all_versions[idx + 1 :]: diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index b9e7e642b51c..51e847135efd 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -7,6 +7,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, + VanillaPostgres, import_timeline_from_vanilla_postgres, wait_for_wal_insert_lsn, ) @@ -182,3 +183,275 @@ def test_import_at_2bil( cur = conn.cursor() cur.execute("SELECT count(*) from t") assert cur.fetchone() == (10000 + 1 + 1,) + + +# Constants and macros copied from PostgreSQL multixact.c and headers. These are needed to +# calculate the SLRU segments that a particular multixid or multixid-offsets falls into. 
+BLCKSZ = 8192 +MULTIXACT_OFFSETS_PER_PAGE = int(BLCKSZ / 4) +SLRU_PAGES_PER_SEGMENT = int(32) +MXACT_MEMBER_BITS_PER_XACT = 8 +MXACT_MEMBER_FLAGS_PER_BYTE = 1 +MULTIXACT_FLAGBYTES_PER_GROUP = 4 +MULTIXACT_MEMBERS_PER_MEMBERGROUP = MULTIXACT_FLAGBYTES_PER_GROUP * MXACT_MEMBER_FLAGS_PER_BYTE +MULTIXACT_MEMBERGROUP_SIZE = 4 * MULTIXACT_MEMBERS_PER_MEMBERGROUP + MULTIXACT_FLAGBYTES_PER_GROUP +MULTIXACT_MEMBERGROUPS_PER_PAGE = int(BLCKSZ / MULTIXACT_MEMBERGROUP_SIZE) +MULTIXACT_MEMBERS_PER_PAGE = MULTIXACT_MEMBERGROUPS_PER_PAGE * MULTIXACT_MEMBERS_PER_MEMBERGROUP + + +def MultiXactIdToOffsetSegment(xid: int): + return int(xid / (SLRU_PAGES_PER_SEGMENT * MULTIXACT_OFFSETS_PER_PAGE)) + + +def MXOffsetToMemberSegment(off: int): + return int(off / (SLRU_PAGES_PER_SEGMENT * MULTIXACT_MEMBERS_PER_PAGE)) + + +def advance_multixid_to( + pg_bin: PgBin, vanilla_pg: VanillaPostgres, next_multi_xid: int, next_multi_offset: int +): + """ + Use pg_resetwal to advance the nextMulti and nextMultiOffset values in a stand-alone + Postgres cluster. This is useful to get close to wraparound or some other interesting + value, without having to burn a lot of time consuming the (multi-)XIDs one by one. + + The new values should be higher than the old ones, in a wraparound-aware sense. + + On entry, the server should be running. It will be shut down and restarted. + """ + + # Read old values from the last checkpoint. We will pass the old oldestMultiXid value + # back to pg_resetwal, there's no option to leave it alone. + with vanilla_pg.connect() as conn: + with conn.cursor() as cur: + # Make sure the oldest-multi-xid value in the control file is up-to-date + cur.execute("checkpoint") + cur.execute("select oldest_multi_xid, next_multixact_id from pg_control_checkpoint()") + rec = cur.fetchone() + assert rec is not None + (ckpt_oldest_multi_xid, ckpt_next_multi_xid) = rec + log.info(f"oldestMultiXid was {ckpt_oldest_multi_xid}, nextMultiXid was {ckpt_next_multi_xid}") + log.info(f"Resetting to {next_multi_xid}") + + # Use pg_resetwal to reset the next multiXid and multiOffset to given values. + vanilla_pg.stop() + pg_resetwal_path = os.path.join(pg_bin.pg_bin_path, "pg_resetwal") + cmd = [ + pg_resetwal_path, + f"--multixact-ids={next_multi_xid},{ckpt_oldest_multi_xid}", + f"--multixact-offset={next_multi_offset}", + "-D", + str(vanilla_pg.pgdatadir), + ] + pg_bin.run_capture(cmd) + + # Because we skip over a lot of values, Postgres hasn't created the SLRU segments for + # the new values yet. Create them manually, to allow Postgres to start up. + # + # This leaves "gaps" in the SLRU where segments between old value and new value are + # missing. That's OK for our purposes. Autovacuum will print some warnings about the + # missing segments, but will clean it up by truncating the SLRUs up to the new value, + # closing the gap. 
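For the values the wraparound test below passes to advance_multixid_to (0x40000000 and 0x10000000 in its first call), the helpers above work out as follows; a small sketch that only reuses the constants defined here:

    # One offsets segment covers 32 pages * 2048 entries = 65536 multixids,
    # one members segment covers 32 pages * 1636 members = 52352 member offsets.
    assert MultiXactIdToOffsetSegment(0x40000000) == 0x4000  # "%04X" -> pg_multixact/offsets/4000
    assert MXOffsetToMemberSegment(0x10000000) == 5127       # "%04X" -> pg_multixact/members/1407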
+ segname = "%04X" % MultiXactIdToOffsetSegment(next_multi_xid) + log.info(f"Creating dummy segment pg_multixact/offsets/{segname}") + with open(vanilla_pg.pgdatadir / "pg_multixact" / "offsets" / segname, "w") as of: + of.write("\0" * SLRU_PAGES_PER_SEGMENT * BLCKSZ) + of.flush() + + segname = "%04X" % MXOffsetToMemberSegment(next_multi_offset) + log.info(f"Creating dummy segment pg_multixact/members/{segname}") + with open(vanilla_pg.pgdatadir / "pg_multixact" / "members" / segname, "w") as of: + of.write("\0" * SLRU_PAGES_PER_SEGMENT * BLCKSZ) + of.flush() + + # Start Postgres again and wait until autovacuum has processed all the databases + # + # This allows truncating the SLRUs, fixing the gaps with missing segments. + vanilla_pg.start() + with vanilla_pg.connect().cursor() as cur: + for _ in range(1000): + datminmxid = int( + query_scalar(cur, "select min(datminmxid::text::int8) from pg_database") + ) + log.info(f"datminmxid {datminmxid}") + if next_multi_xid - datminmxid < 1_000_000: # not wraparound-aware! + break + time.sleep(0.5) + + +def test_multixid_wraparound_import( + neon_env_builder: NeonEnvBuilder, + test_output_dir: Path, + pg_bin: PgBin, + vanilla_pg, +): + """ + Test that the wraparound of the "next-multi-xid" counter is handled correctly in + pageserver, And multi-offsets as well + """ + env = neon_env_builder.init_start() + + # In order to to test multixid wraparound, we need to first advance the counter to + # within spitting distance of the wraparound, that is 2^32 multi-XIDs. We could simply + # run a workload that consumes a lot of multi-XIDs until we approach that, but that + # takes a very long time. So we cheat. + # + # Our strategy is to create a vanilla Postgres cluster, and use pg_resetwal to + # directly set the multi-xid counter a higher value. However, we cannot directly set + # it to just before 2^32 (~ 4 billion), because that would make the exisitng + # 'relminmxid' values to look like they're in the future. It's not clear how the + # system would behave in that situation. So instead, we bump it up ~ 1 billion + # multi-XIDs at a time, and let autovacuum to process all the relations and update + # 'relminmxid' between each run. + # + # XXX: For the multi-offsets, most of the bump is done in the last call. This is + # because advancing it ~ 1 billion at a time hit a pathological case in the + # MultiXactMemberFreezeThreshold() function, causing autovacuum not trigger multixid + # freezing. See + # https://www.postgresql.org/message-id/85fb354c-f89f-4d47-b3a2-3cbd461c90a3%40iki.fi + # Multi-offsets don't have the same wraparound problems at 2 billion mark as + # multi-xids do, so one big jump is fine. 
+ vanilla_pg.configure( + [ + "log_autovacuum_min_duration = 0", + # Perform anti-wraparound vacuuming aggressively + "autovacuum_naptime='1 s'", + "autovacuum_freeze_max_age = 1000000", + "autovacuum_multixact_freeze_max_age = 1000000", + ], + ) + vanilla_pg.start() + advance_multixid_to(pg_bin, vanilla_pg, 0x40000000, 0x10000000) + advance_multixid_to(pg_bin, vanilla_pg, 0x80000000, 0x20000000) + advance_multixid_to(pg_bin, vanilla_pg, 0xC0000000, 0x30000000) + advance_multixid_to(pg_bin, vanilla_pg, 0xFFFFFF00, 0xFFFFFF00) + + vanilla_pg.safe_psql("create user cloud_admin with password 'postgres' superuser") + vanilla_pg.safe_psql("create table tt as select g as id from generate_series(1, 10) g") + vanilla_pg.safe_psql("CHECKPOINT") + + # Import the cluster to the pageserver + tenant_id = TenantId.generate() + env.pageserver.tenant_create(tenant_id) + timeline_id = TimelineId.generate() + import_timeline_from_vanilla_postgres( + test_output_dir, + env, + pg_bin, + tenant_id, + timeline_id, + "imported_multixid_wraparound_test", + vanilla_pg.connstr(), + ) + vanilla_pg.stop() + + endpoint = env.endpoints.create_start( + "imported_multixid_wraparound_test", + tenant_id=tenant_id, + config_lines=[ + "log_autovacuum_min_duration = 0", + "autovacuum_naptime='5 s'", + "autovacuum=off", + ], + ) + conn = endpoint.connect() + cur = conn.cursor() + assert query_scalar(cur, "select count(*) from tt") == 10 # sanity check + + # Install extension containing function needed for test + cur.execute("CREATE EXTENSION neon_test_utils") + + # Consume a lot of XIDs, just to advance the XIDs to different range than the + # multi-xids. That avoids confusion while debugging + cur.execute("select test_consume_xids(100000)") + cur.execute("select pg_switch_wal()") + cur.execute("checkpoint") + + # Use subtransactions so that each row in 'tt' is stamped with different XID. Leave + # the transaction open. + cur.execute("BEGIN") + cur.execute( + """ +do $$ +declare + idvar int; +begin + for idvar in select id from tt loop + begin + update tt set id = idvar where id = idvar; + exception when others then + raise 'didn''t expect an error: %', sqlerrm; + end; + end loop; +end; +$$; +""" + ) + + # In a different transaction, acquire a FOR KEY SHARE lock on each row. This generates + # a new multixid for each row, with the previous xmax and this transaction's XID as the + # members. + # + # Repeat this until the multi-xid counter wraps around. + conn3 = endpoint.connect() + cur3 = conn3.cursor() + next_multixact_id_before_restart = 0 + observed_before_wraparound = False + while True: + cur3.execute("BEGIN") + cur3.execute("SELECT * FROM tt FOR KEY SHARE") + + # Get the xmax of one of the rows we locked. It should be a multi-xid. It might + # not be the latest one, but close enough. + row_xmax = int(query_scalar(cur3, "SELECT xmax FROM tt LIMIT 1")) + cur3.execute("COMMIT") + log.info(f"observed a row with xmax {row_xmax}") + + # High value means not wrapped around yet + if row_xmax >= 0xFFFFFF00: + observed_before_wraparound = True + continue + + # xmax should not be a regular XID. (We bumped up the regular XID range earlier + # to around 100000 and above.) + assert row_xmax < 100 + + # xmax values < FirstNormalTransactionId (== 3) could be special XID values, or + # multixid values after wraparound. 
We don't know for sure which, so keep going to + # be sure we see value that's unambiguously a wrapped-around multixid + if row_xmax < 3: + continue + + next_multixact_id_before_restart = row_xmax + log.info( + f"next_multixact_id is now at {next_multixact_id_before_restart} or a little higher" + ) + break + + # We should have observed the state before wraparound + assert observed_before_wraparound + + cur.execute("COMMIT") + + # Wait until pageserver has received all the data, and restart the endpoint + wait_for_wal_insert_lsn(env, endpoint, tenant_id, timeline_id) + endpoint.stop(mode="immediate") # 'immediate' to avoid writing shutdown checkpoint + endpoint.start() + + # Check that the next-multixid value wrapped around correctly + conn = endpoint.connect() + cur = conn.cursor() + cur.execute("select next_multixact_id from pg_control_checkpoint()") + next_multixact_id_after_restart = int( + query_scalar(cur, "select next_multixact_id from pg_control_checkpoint()") + ) + log.info(f"next_multixact_id after restart: {next_multixact_id_after_restart}") + assert next_multixact_id_after_restart >= next_multixact_id_before_restart + + # The multi-offset should wrap around as well + cur.execute("select next_multi_offset from pg_control_checkpoint()") + next_multi_offset_after_restart = int( + query_scalar(cur, "select next_multi_offset from pg_control_checkpoint()") + ) + log.info(f"next_multi_offset after restart: {next_multi_offset_after_restart}") + assert next_multi_offset_after_restart < 100000 diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index abbea59113f1..caeae7fd15c6 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -85,8 +85,10 @@ def check_client(env: NeonEnv, client: PageserverHttpClient): # create new tenant and check it is also there tenant_id = TenantId.generate() - client.tenant_create( - tenant_id, generation=env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id) + env.pageserver.tenant_create( + tenant_id, + generation=env.storage_controller.attach_hook_issue(tenant_id, env.pageserver.id), + auth_token=client.auth_token, ) assert tenant_id in {TenantId(t["id"]) for t in client.tenant_list()} diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 696af24e5c0a..7ce38c5c3c82 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -249,10 +249,6 @@ def test_deferred_deletion(neon_env_builder: NeonEnvBuilder): assert timeline["remote_consistent_lsn"] == timeline["remote_consistent_lsn_visible"] assert get_deletion_queue_dropped_lsn_updates(ps_http) == 0 - main_pageserver.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) - # Now advance the generation in the control plane: subsequent validations # from the running pageserver will fail. No more deletions should happen. env.storage_controller.attach_hook_issue(env.initial_tenant, other_pageserver.id) @@ -397,8 +393,6 @@ def assert_deletions_submitted(n: int) -> None: # validated before restart. assert get_deletion_queue_executed(ps_http) == before_restart_depth else: - main_pageserver.allowed_errors.extend([".*Dropping stale deletions.*"]) - # If we lost the attachment, we should have dropped our pre-restart deletions. 
assert get_deletion_queue_dropped(ps_http) == before_restart_depth @@ -553,13 +547,6 @@ def test_multi_attach( tenant_id = env.initial_tenant timeline_id = env.initial_timeline - # We will intentionally create situations where stale deletions happen from non-latest-generation - # nodes when the tenant is multiply-attached - for ps in env.pageservers: - ps.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) - # Initially, the tenant will be attached to the first pageserver (first is default in our test harness) wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active")) _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) diff --git a/test_runner/regress/test_pageserver_reconnect.py b/test_runner/regress/test_pageserver_reconnect.py index aecfcdd262e5..37ff923632d2 100644 --- a/test_runner/regress/test_pageserver_reconnect.py +++ b/test_runner/regress/test_pageserver_reconnect.py @@ -2,6 +2,7 @@ import time from contextlib import closing +import psycopg2.errors from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, PgBin @@ -40,3 +41,26 @@ def run_pgbench(connstr: str): c.execute("select pg_reload_conf()") thread.join() + + +# Test handling errors during page server reconnect +def test_pageserver_reconnect_failure(neon_simple_env: NeonEnv): + env = neon_simple_env + env.neon_cli.create_branch("test_pageserver_reconnect") + endpoint = env.endpoints.create_start("test_pageserver_reconnect") + + con = endpoint.connect() + cur = con.cursor() + + cur.execute("set statement_timeout='2s'") + cur.execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'") + connstring = cur.fetchall()[0][0] + cur.execute( + f"alter system set neon.pageserver_connstring='{connstring}?some_invalid_param=xyz'" + ) + cur.execute("select pg_reload_conf()") + try: + cur.execute("select count(*) from pg_class") + except psycopg2.errors.QueryCanceled: + log.info("Connection to PS failed") + assert not endpoint.log_contains("ERROR: cannot wait on socket event without a socket.*") diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 8431840dc069..0416078ebc67 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -16,6 +16,8 @@ from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage from fixtures.utils import wait_until from fixtures.workload import Workload +from werkzeug.wrappers.request import Request +from werkzeug.wrappers.response import Response # A tenant configuration that is convenient for generating uploads and deletions # without a large amount of postgres traffic. @@ -59,7 +61,7 @@ def evict_random_layers( @pytest.mark.parametrize("seed", [1, 2, 3]) -def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): +def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, make_httpserver, seed: int): """ Issue many location configuration changes, ensure that tenants remain readable & we don't get any unexpected errors. 
We should @@ -73,6 +75,20 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): neon_env_builder.enable_pageserver_remote_storage( remote_storage_kind=s3_storage(), ) + neon_env_builder.control_plane_compute_hook_api = ( + f"http://{make_httpserver.host}:{make_httpserver.port}/notify-attach" + ) + + def ignore_notify(request: Request): + # This test does all its own compute configuration (by passing explicit pageserver ID to Workload functions), + # so we send controller notifications to /dev/null to prevent it fighting the test for control of the compute. + log.info(f"Ignoring storage controller compute notification: {request.json}") + return Response(status=200) + + make_httpserver.expect_request("/notify-attach", method="PUT").respond_with_handler( + ignore_notify + ) + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) pageservers = env.pageservers @@ -83,9 +99,6 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): for ps in env.pageservers: ps.allowed_errors.extend( [ - # We will make no effort to avoid stale attachments - ".*Dropped remote consistent LSN updates.*", - ".*Dropping stale deletions.*", # page_service_conn_main{peer_addr=[::1]:41176}: query handler for 'pagestream 3b19aec5038c796f64b430b30a555121 d07776761d44050b8aab511df1657d83' failed: Tenant 3b19aec5038c796f64b430b30a555121 not found ".*query handler.*Tenant.*not found.*", # page_service_conn_main{peer_addr=[::1]:45552}: query handler for 'pagestream 414ede7ad50f775a8e7d9ba0e43b9efc a43884be16f44b3626482b6981b2c745' failed: Tenant 414ede7ad50f775a8e7d9ba0e43b9efc is not active @@ -102,6 +115,15 @@ def test_location_conf_churn(neon_env_builder: NeonEnvBuilder, seed: int): workload.init(env.pageservers[0].id) workload.write_rows(256, env.pageservers[0].id) + # Discourage the storage controller from interfering with the changes we will make directly on the pageserver + env.storage_controller.tenant_policy_update( + tenant_id, + { + "scheduling": "Stop", + }, + ) + env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy Stop.*") + # We use a fixed seed to make the test reproducible: we want a randomly # chosen order, but not to change the order every time we run the test. rng = random.Random(seed) diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 756a2c17c909..54b493ec705d 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -8,8 +8,11 @@ import pytest from fixtures.neon_fixtures import ( + Endpoint, + NeonEnv, NeonEnvBuilder, check_restored_datadir_content, + tenant_get_shards, ) from fixtures.pg_version import PgVersion from fixtures.remote_storage import s3_storage @@ -21,6 +24,97 @@ from pytest import CaptureFixture +TENANT_CONF = { + # Scaled down thresholds so that we are exercising the pageserver beyond just writing + # ephemeral/L0 layers, and because debug-mode code is slow to read from full sized ephemeral layer files. + "pitr_interval": "60s", + "checkpoint_distance": f"{8 * 1024 * 1024}", + "compaction_target_size": f"{8 * 1024 * 1024}", +} + +# # Ensure that compaction works, on a timeline containing all the diversity that postgres regression tests create. +# # There should have been compactions mid-test as well, this final check is in addition those. 
+# for (shard, pageserver) in tenant_get_shards(env, env.initial_tenant): +# pageserver.http_client().timeline_checkpoint(env.initial_tenant, env.initial_timeline, force_repartition=True, force_image_layer_creation=True) + + +def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: Endpoint): + """ + After running some opaque tests that create interesting content in a timeline, run + some generic integrity checks that the storage stack is able to reproduce the written + data properly. + """ + + ignored_files: Optional[list[str]] = None + + # Neon handles unlogged relations in a special manner. During a + # basebackup, we ship the init fork as the main fork. This presents a + # problem in that the endpoint's data directory and the basebackup will + # have differences and will fail the eventual file comparison. + # + # Unlogged tables were introduced in version 9.1. ALTER TABLE grew + # support for setting the persistence of a table in 9.5. The reason that + # this doesn't affect versions < 15 (but probably would between 9.1 and + # 9.5) is that all the regression tests that deal with unlogged tables + # up until that point dropped the unlogged tables or set them to logged + # at some point during the test. + # + # In version 15, Postgres grew support for unlogged sequences, and with + # that came a few more regression tests. These tests did not all drop + # the unlogged tables/sequences prior to finishing. + # + # But unlogged sequences came with a bug in that, sequences didn't + # inherit the persistence of their "parent" tables if they had one. This + # was fixed and backported to 15, thus exacerbating our problem a bit. + # + # So what we can do is just ignore file differences between the data + # directory and basebackup for unlogged relations. + results = cast( + "list[tuple[str, str]]", + endpoint.safe_psql( + """ + SELECT + relkind, + pg_relation_filepath( + pg_filenode_relation(reltablespace, relfilenode) + ) AS unlogged_relation_paths + FROM pg_class + WHERE relpersistence = 'u' + """, + dbname=db_name, + ), + ) + + unlogged_relation_files: list[str] = [] + for r in results: + unlogged_relation_files.append(r[1]) + # This is related to the following Postgres commit: + # + # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b + # Author: Heikki Linnakangas + # Date: 2023-08-23 09:21:31 -0500 + # + # Use the buffer cache when initializing an unlogged index. + # + # This patch was backpatched to 16. Without it, the LSN in the + # page header would be 0/0 in the data directory, which wouldn't + # match the LSN generated during the basebackup, thus creating + # a difference. + if env.pg_version <= PgVersion.V15 and r[0] == "i": + unlogged_relation_files.append(f"{r[1]}_init") + + ignored_files = unlogged_relation_files + + check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files) + + # Ensure that compaction works, on a timeline containing all the diversity that postgres regression tests create. + # There should have been compactions mid-test as well, this final check is in addition those. + for shard, pageserver in tenant_get_shards(env, env.initial_tenant): + pageserver.http_client().timeline_checkpoint( + shard, env.initial_timeline, force_repartition=True, force_image_layer_creation=True + ) + + # Run the main PostgreSQL regression tests, in src/test/regress. 
# @pytest.mark.timeout(600) @@ -45,7 +139,10 @@ def test_pg_regress( neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, + initial_tenant_shard_count=shard_count, + ) # Connect to postgres and create a database called "regression". endpoint = env.endpoints.create_start("main") @@ -84,67 +181,7 @@ def test_pg_regress( with capsys.disabled(): pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - ignored_files: Optional[list[str]] = None - - # Neon handles unlogged relations in a special manner. During a - # basebackup, we ship the init fork as the main fork. This presents a - # problem in that the endpoint's data directory and the basebackup will - # have differences and will fail the eventual file comparison. - # - # Unlogged tables were introduced in version 9.1. ALTER TABLE grew - # support for setting the persistence of a table in 9.5. The reason that - # this doesn't affect versions < 15 (but probably would between 9.1 and - # 9.5) is that all the regression tests that deal with unlogged tables - # up until that point dropped the unlogged tables or set them to logged - # at some point during the test. - # - # In version 15, Postgres grew support for unlogged sequences, and with - # that came a few more regression tests. These tests did not all drop - # the unlogged tables/sequences prior to finishing. - # - # But unlogged sequences came with a bug in that, sequences didn't - # inherit the persistence of their "parent" tables if they had one. This - # was fixed and backported to 15, thus exacerbating our problem a bit. - # - # So what we can do is just ignore file differences between the data - # directory and basebackup for unlogged relations. - results = cast( - "list[tuple[str, str]]", - endpoint.safe_psql( - """ - SELECT - relkind, - pg_relation_filepath( - pg_filenode_relation(reltablespace, relfilenode) - ) AS unlogged_relation_paths - FROM pg_class - WHERE relpersistence = 'u' - """, - dbname=DBNAME, - ), - ) - - unlogged_relation_files: list[str] = [] - for r in results: - unlogged_relation_files.append(r[1]) - # This is related to the following Postgres commit: - # - # commit ccadf73163ca88bdaa74b8223d4dde05d17f550b - # Author: Heikki Linnakangas - # Date: 2023-08-23 09:21:31 -0500 - # - # Use the buffer cache when initializing an unlogged index. - # - # This patch was backpatched to 16. Without it, the LSN in the - # page header would be 0/0 in the data directory, which wouldn't - # match the LSN generated during the basebackup, thus creating - # a difference. - if env.pg_version <= PgVersion.V15 and r[0] == "i": - unlogged_relation_files.append(f"{r[1]}_init") - - ignored_files = unlogged_relation_files - - check_restored_datadir_content(test_output_dir, env, endpoint, ignored_files=ignored_files) + post_checks(env, test_output_dir, DBNAME, endpoint) # Run the PostgreSQL "isolation" tests, in src/test/isolation. 
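post_checks above is intended to be reusable by any test that leaves interesting data in a timeline. A hypothetical minimal caller, sketched with the fixtures this file already uses (the test name and database name are illustrative):

    def test_minimal_post_checks_example(neon_env_builder, test_output_dir):
        env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF)
        endpoint = env.endpoints.create_start("main")
        endpoint.safe_psql("CREATE DATABASE example")
        # generate some data worth checking
        endpoint.safe_psql("create table t as select generate_series(1, 10000) g", dbname="example")
        post_checks(env, test_output_dir, "example", endpoint)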
@@ -159,16 +196,20 @@ def test_isolation( pg_distrib_dir: Path, shard_count: Optional[int], ): + DBNAME = "isolation_regression" + if shard_count is not None: neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count + ) # Connect to postgres and create a database called "regression". # isolation tests use prepared transactions, so enable them endpoint = env.endpoints.create_start("main", config_lines=["max_prepared_transactions=100"]) - endpoint.safe_psql("CREATE DATABASE isolation_regression") + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_isolation_regress to run in. runpath = test_output_dir / "regress" @@ -202,6 +243,9 @@ def test_isolation( with capsys.disabled(): pg_bin.run(pg_isolation_regress_command, env=env_vars, cwd=runpath) + # This fails with a mismatch on `pg_multixact/offsets/0000` + # post_checks(env, test_output_dir, DBNAME, endpoint) + # Run extra Neon-specific pg_regress-based tests. The tests and their # schedule file are in the sql_regress/ directory. @@ -215,15 +259,19 @@ def test_sql_regress( pg_distrib_dir: Path, shard_count: Optional[int], ): + DBNAME = "regression" + if shard_count is not None: neon_env_builder.num_pageservers = shard_count neon_env_builder.enable_pageserver_remote_storage(s3_storage()) neon_env_builder.enable_scrub_on_exit() - env = neon_env_builder.init_start(initial_tenant_shard_count=shard_count) + env = neon_env_builder.init_start( + initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count + ) # Connect to postgres and create a database called "regression". endpoint = env.endpoints.create_start("main") - endpoint.safe_psql("CREATE DATABASE regression") + endpoint.safe_psql(f"CREATE DATABASE {DBNAME}") # Create some local directories for pg_regress to run in. runpath = test_output_dir / "regress" @@ -258,4 +306,4 @@ def test_sql_regress( with capsys.disabled(): pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) - check_restored_datadir_content(test_output_dir, env, endpoint) + post_checks(env, test_output_dir, DBNAME, endpoint) diff --git a/test_runner/regress/test_physical_replication.py b/test_runner/regress/test_physical_replication.py index a1bff32eedd1..043aff686b09 100644 --- a/test_runner/regress/test_physical_replication.py +++ b/test_runner/regress/test_physical_replication.py @@ -1,7 +1,11 @@ +from __future__ import annotations + import random import time +from typing import TYPE_CHECKING -from fixtures.neon_fixtures import NeonEnv +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv def test_physical_replication(neon_simple_env: NeonEnv): diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index b26bd3422f30..fac7fe9deef6 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -355,13 +355,6 @@ def churn_while_failpoints_active(result): env.pageserver.stop(immediate=True) env.endpoints.stop_all() - # We are about to forcibly drop local dirs. 
Storage controller will increment generation in re-attach before - # we later increment when actually attaching it again, leading to skipping a generation and potentially getting - # these warnings if there was a durable but un-executed deletion list at time of restart. - env.pageserver.allowed_errors.extend( - [".*Dropped remote consistent LSN updates.*", ".*Dropping stale deletions.*"] - ) - dir_to_clear = env.pageserver.tenant_dir() shutil.rmtree(dir_to_clear) os.mkdir(dir_to_clear) diff --git a/test_runner/regress/test_replica_start.py b/test_runner/regress/test_replica_start.py new file mode 100644 index 000000000000..17d476a8a690 --- /dev/null +++ b/test_runner/regress/test_replica_start.py @@ -0,0 +1,646 @@ +""" +In PostgreSQL, a standby always has to wait for a running-xacts WAL record to +arrive before it can start accepting queries. Furthermore, if there are +transactions with too many subxids (> 64) open to fit in the in-memory subxids +cache, the running-xacts record will be marked as "suboverflowed", and the +standby will need to also wait for the currently in-progress transactions to +finish. + +In Neon, we have an additional mechanism that scans the CLOG at server startup +to determine the list of running transactions, so that the standby can start up +immediately without waiting for the running-xacts record, but that mechanism +only works if the # of active (sub-)transactions is reasonably small. Otherwise +it falls back to waiting. Furthermore, it's somewhat optimistic in using up the +known-assigned XIDs array: if too many transactions with subxids are started in +the primary later, the replay in the replica will crash with "too many +KnownAssignedXids" error. + +This module contains tests for those various cases at standby startup: starting +from shutdown checkpoint, using the CLOG scanning mechanism, waiting for +running-xacts record and for in-progress transactions to finish etc. +""" + +import threading +from contextlib import closing + +import psycopg2 +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import NeonEnv, wait_for_last_flush_lsn, wait_replica_caughtup +from fixtures.pg_version import PgVersion +from fixtures.utils import query_scalar, wait_until + +CREATE_SUBXACTS_FUNC = """ +create or replace function create_subxacts(n integer) returns void as $$ +declare + i integer; +begin + for i in 1..n loop + begin + insert into t (payload) values (0); + exception + when others then + raise exception 'caught something: %', sqlerrm; + end; + end loop; +end; $$ language plpgsql +""" + + +def test_replica_start_scan_clog(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup. There is one + transaction active in the primary when the standby is started. The primary + is killed before it has a chance to write a running-xacts record. The + CLOG-scanning at neon startup allows the standby to start up anyway. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + primary_cur.execute("select pg_switch_wal()") + + # Start a transaction in the primary. 
Leave the transaction open. + # + # The transaction has some subtransactions, but not too many to cause the + # CLOG-scanning mechanism to give up. + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(50)") + + # Wait for the WAL to be flushed, but then immediately kill the primary, + # before it has a chance to generate a running-xacts record. + primary_cur.execute("select neon_xlogflush()") + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + primary.stop(mode="immediate") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (0,) + + +def test_replica_start_scan_clog_crashed_xids(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup, after + leaving behind crashed transactions. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + primary_cur.execute("select pg_switch_wal()") + + # Consume a lot of XIDs, then kill Postgres without giving it a + # chance to write abort records for them. + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(100000)") + primary.stop(mode="immediate") + + # Restart the primary. Do some light work, and shut it down cleanly + primary.start() + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("insert into t (payload) values (0)") + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. (Restarting the primary writes a checkpoint and/or running-xacts + # record, which allows the standby to know that the crashed XIDs are aborted) + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (1,) + + +def test_replica_start_at_running_xacts(neon_simple_env: NeonEnv, pg_version): + """ + Test that starting a replica works right after the primary has + created a running-xacts record. This may seem like a trivial case, + but during development, we had a bug that was triggered by having + oldestActiveXid == nextXid. Starting right after a running-xacts + record is one way to test that case. + + See the module docstring for background. 
+ """ + env = neon_simple_env + + if env.pg_version == PgVersion.V14 or env.pg_version == PgVersion.V15: + pytest.skip("pg_log_standby_snapshot() function is available only in PG16") + + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("select pg_log_standby_snapshot()") + primary_cur.execute("select neon_xlogflush()") + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select 123") + assert secondary_cur.fetchone() == (123,) + + +def test_replica_start_wait_subxids_finish(neon_simple_env: NeonEnv): + """ + Test replica startup when there are a lot of (sub)transactions active in the + primary. That's too many for the CLOG-scanning mechanism to handle, so the + replica has to wait for the large transaction to finish before it starts to + accept queries. + + After replica startup, test MVCC with transactions that were in-progress + when the replica was started. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create + # lots of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Start a transaction with 100000 subtransactions, and leave it open. That's + # too many to fit in the "known-assigned XIDs array" in the replica, and + # also too many to fit in the subxid caches so the running-xacts record will + # also overflow. + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(100000)") + + # Start another, smaller transaction in the primary. We'll come back to this + # later. + primary_conn2 = primary.connect() + primary_cur2 = primary_conn2.cursor() + primary_cur2.execute("begin") + primary_cur2.execute("insert into t (payload) values (0)") + + # Create a replica. but before that, wait for the wal to be flushed to + # safekeepers, so that the replica is started at a point where the large + # transaction is already active. (The whole transaction might not be flushed + # yet, but that's OK.) + # + # Start it in a separate thread, so that we can do other stuff while it's + # blocked waiting for the startup to finish. + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + secondary = env.endpoints.new_replica(origin=primary, endpoint_id="secondary") + start_secondary_thread = threading.Thread(target=secondary.start) + start_secondary_thread.start() + + # Verify that the replica has otherwise started up, but cannot start + # accepting queries yet. + log.info("Waiting 5 s to verify that the secondary does not start") + start_secondary_thread.join(5) + assert secondary.log_contains("consistent recovery state reached") + assert secondary.log_contains("started streaming WAL from primary") + # The "redo starts" message is printed when the first WAL record is + # received. 
It might or might not be present in the log depending on how + # far exactly the WAL was flushed when the replica was started, and whether + # background activity caused any more WAL records to be flushed on the + # primary afterwards. + # + # assert secondary.log_contains("redo # starts") + + # should not be open for connections yet + assert start_secondary_thread.is_alive() + assert not secondary.is_running() + assert not secondary.log_contains("database system is ready to accept read-only connections") + + # Commit the large transaction in the primary. + # + # Within the next 15 s, the primary should write a new running-xacts record + # to the WAL which shows the transaction as completed. Once the replica + # replays that record, it will start accepting queries. + primary_cur.execute("commit") + start_secondary_thread.join() + + # Verify that the large transaction is correctly visible in the secondary + # (but not the second, small transaction, which is still in-progress!) + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + # Perform some more MVCC testing using the second transaction that was + # started in the primary before the replica was created + primary_cur2.execute("select create_subxacts(10000)") + + # The second transaction still hasn't committed + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("BEGIN ISOLATION LEVEL REPEATABLE READ") + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + # Commit the second transaction in the primary + primary_cur2.execute("commit") + + # Should still be invisible to the old snapshot + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + # Commit the REPEATABLE READ transaction in the replica. Both + # primary transactions should now be visible to a new snapshot. + secondary_cur.execute("commit") + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (110001,) + + +def test_replica_too_many_known_assigned_xids(neon_simple_env: NeonEnv): + """ + The CLOG-scanning mechanism fills the known-assigned XIDs array + optimistically at standby startup, betting that it can still fit + upcoming transactions replayed later from the WAL in the + array. This test tests what happens when that bet fails and the + known-assigned XID array fills up after the standby has already + been started. The WAL redo will fail with an error: + + FATAL: too many KnownAssignedXids + CONTEXT: WAL redo at 0/1895CB0 for neon/INSERT: off: 25, flags: 0x08; blkref #0: rel 1663/5/16385, blk 64 + + which causes the standby to shut down. + + See the module docstring for background. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. 
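Back-of-the-envelope numbers for the overflow this test provokes, assuming the stock defaults of max_connections = 100 and superuser_reserved_connections = 3 (the real values are read from the server below):

    max_connections, reserved, n_subxids = 100, 3, 200
    open_at_standby_start = max_connections * 30               # ~3000 subxids in the one large transaction
    replayed_later = (max_connections - reserved) * n_subxids  # ~19400 more subxids, all still uncommitted
    # The standby only budgets on the order of 64 tracked XIDs (PGPROC_MAX_CACHED_SUBXIDS)
    # per possible backend, a few thousand slots in total here, so WAL replay eventually
    # hits the "too many KnownAssignedXids" FATAL described in the docstring.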
+ env = neon_simple_env + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Determine how many connections we can use + primary_cur.execute("show max_connections") + max_connections = int(primary_cur.fetchall()[0][0]) + primary_cur.execute("show superuser_reserved_connections") + superuser_reserved_connections = int(primary_cur.fetchall()[0][0]) + n_connections = max_connections - superuser_reserved_connections + n_subxids = 200 + + # Start one top transaction in primary, with lots of subtransactions. This + # uses up much of the known-assigned XIDs space in the standby, but doesn't + # cause it to overflow. + large_p_conn = primary.connect() + large_p_cur = large_p_conn.cursor() + large_p_cur.execute("begin") + large_p_cur.execute(f"select create_subxacts({max_connections} * 30)") + + with closing(primary.connect()) as small_p_conn: + with small_p_conn.cursor() as small_p_cur: + small_p_cur.execute("select create_subxacts(1)") + + # Create a replica at this LSN + primary_cur.execute("select neon_xlogflush()") + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + + # The transaction in primary has not committed yet. + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (1,) + + # Start max number of top transactions in primary, with a lot of + # subtransactions each. We add the subtransactions to each top transaction + # in a round-robin fashion, instead of adding a lot of subtransactions to + # one top transaction at a time. This way, we will have the max number of + # subtransactions in the in-memory subxid cache of each top transaction, + # until they all overflow. + # + # Currently, PGPROC_MAX_CACHED_SUBXIDS == 64, so this will overflow the all + # the subxid caches after creating 64 subxids in each top transaction. The + # point just before the caches have overflowed is the most interesting point + # in time, but we'll keep going beyond that, to ensure that this test is + # robust even if PGPROC_MAX_CACHED_SUBXIDS changes. + p_curs = [] + for _ in range(0, n_connections): + p_cur = primary.connect().cursor() + p_cur.execute("begin") + p_curs.append(p_cur) + + for _subxid in range(0, n_subxids): + for i in range(0, n_connections): + p_curs[i].execute("select create_subxacts(1)") + + # Commit all the transactions in the primary + for i in range(0, n_connections): + p_curs[i].execute("commit") + large_p_cur.execute("commit") + + # Wait until the replica crashes with "too many KnownAssignedXids" error. 
+ def check_replica_crashed(): + try: + secondary.connect() + except psycopg2.Error: + # Once the connection fails, return success + return None + raise RuntimeError("connection succeeded") + + wait_until(20, 0.5, check_replica_crashed) + assert secondary.log_contains("too many KnownAssignedXids") + + # The replica has crashed, so ignore the stop result + secondary.check_stop_result = False + + +def test_replica_start_repro_visibility_bug(neon_simple_env: NeonEnv): + """ + Before PR #7288, a hot standby in neon incorrectly started up + immediately, before it had received a running-xacts record. That + led to visibility bugs if there were active transactions in the + primary. This test reproduces the incorrect query results and + incorrectly set hint bits that occurred before the fix. + """ + env = neon_simple_env + + primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") + p_cur = primary.connect().cursor() + + p_cur.execute("begin") + p_cur.execute("create table t(pk integer primary key, payload integer)") + p_cur.execute("insert into t values (generate_series(1,100000), 0)") + + secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") + wait_replica_caughtup(primary, secondary) + s_cur = secondary.connect().cursor() + + # Set hint bits for pg_class tuples. If the primary's transaction is + # not marked as in-progress in the MVCC snapshot, then the XMIN_INVALID + # hint bit will be set on the pg_class tuple for table 't', making it invisible + # even after the commit record is replayed later. + s_cur.execute("select * from pg_class") + + p_cur.execute("commit") + wait_replica_caughtup(primary, secondary) + s_cur.execute("select * from t where pk = 1") + assert s_cur.fetchone() == (1, 0) + + +@pytest.mark.parametrize("shutdown", [True, False]) +def test_replica_start_with_prepared_xacts(neon_simple_env: NeonEnv, shutdown: bool): + """ + Test the CLOG-scanning mechanism at hot standby startup in the presence of + prepared transactions. + + This test is run in two variants: one where the primary server is shut down + before starting the secondary, and one where it keeps running. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", endpoint_id="primary", config_lines=["max_prepared_transactions=5"] + ) + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("CREATE EXTENSION neon_test_utils") + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute("create table t1(pk integer primary key)") + primary_cur.execute("create table t2(pk integer primary key)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Prepare a transaction for two-phase commit + primary_cur.execute("begin") + primary_cur.execute("insert into t1 values (1)") + primary_cur.execute("prepare transaction 't1'") + + # Prepare another transaction for two-phase commit, with a subtransaction + primary_cur.execute("begin") + primary_cur.execute("insert into t2 values (2)") + primary_cur.execute("savepoint sp") + primary_cur.execute("insert into t2 values (3)") + primary_cur.execute("prepare transaction 't2'") + + # Start a transaction in the primary. Leave the transaction open. + # + # The transaction has some subtransactions, but not too many to cause the + # CLOG-scanning mechanism to give up.
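+ # (In the 'shutdown' variant, the fast shutdown of the primary further below + # rolls this open transaction back, which is why the secondary is then expected + # to see zero rows in 't'.)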
+ primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(50)") + + # Wait for the WAL to be flushed + primary_cur.execute("select neon_xlogflush()") + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + + if shutdown: + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. + secondary = env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary", config_lines=["max_prepared_transactions=5"] + ) + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (0,) + secondary_cur.execute("select count(*) from t1") + assert secondary_cur.fetchone() == (0,) + secondary_cur.execute("select count(*) from t2") + assert secondary_cur.fetchone() == (0,) + + if shutdown: + primary.start() + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + else: + primary_cur.execute("commit") + primary_cur.execute("commit prepared 't1'") + primary_cur.execute("commit prepared 't2'") + + wait_replica_caughtup(primary, secondary) + + secondary_cur.execute("select count(*) from t") + if shutdown: + assert secondary_cur.fetchone() == (0,) + else: + assert secondary_cur.fetchone() == (50,) + secondary_cur.execute("select * from t1") + assert secondary_cur.fetchall() == [(1,)] + secondary_cur.execute("select * from t2") + assert secondary_cur.fetchall() == [(2,), (3,)] + + +def test_replica_start_with_prepared_xacts_with_subxacts(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup in the presence of + prepared transactions, with subtransactions. + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", endpoint_id="primary", config_lines=["max_prepared_transactions=5"] + ) + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + + # Install extension containing function needed for test + primary_cur.execute("CREATE EXTENSION neon_test_utils") + + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Advance nextXid close to the beginning of the next pg_subtrans segment (2^16 XIDs) + # + # This is interesting, because it tests that pg_subtrans is initialized correctly + # at standby startup. (We had a bug where it didn't at one point during development.) + while True: + xid = int(query_scalar(primary_cur, "SELECT txid_current()")) + log.info(f"xid now {xid}") + # Consume 500 transactions at a time until we get close + if xid < 65535 - 600: + primary_cur.execute("select test_consume_xids(500);") + else: + break + primary_cur.execute("checkpoint") + + # Prepare a transaction for two-phase commit + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(1000)") + primary_cur.execute("prepare transaction 't1'") + + # Wait for the WAL to be flushed, and stop the primary + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. 
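+ # (A hot standby must allow at least as many prepared transactions as its + # primary, hence the matching max_prepared_transactions setting here.)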
+ secondary = env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary", config_lines=["max_prepared_transactions=5"] + ) + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (0,) + + primary.start() + + # Open a lot of subtransactions in the primary, causing the subxids cache to overflow + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("select create_subxacts(100000)") + + wait_replica_caughtup(primary, secondary) + + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100000,) + + primary_cur.execute("commit prepared 't1'") + + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (101000,) + + +def test_replica_start_with_prepared_xacts_with_many_subxacts(neon_simple_env: NeonEnv): + """ + Test the CLOG-scanning mechanism at hot standby startup in the presence of + prepared transactions, with lots of subtransactions. + + Like test_replica_start_with_prepared_xacts_with_subxacts, but with more + subxacts, to test that the prepared transaction's subxids don't consume + space in the known-assigned XIDs array. (They are set in pg_subtrans + instead) + """ + + # Initialize the primary, a test table, and a helper function to create lots + # of subtransactions. + env = neon_simple_env + primary = env.endpoints.create_start( + branch_name="main", endpoint_id="primary", config_lines=["max_prepared_transactions=5"] + ) + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + + # Install extension containing function needed for test + primary_cur.execute("CREATE EXTENSION neon_test_utils") + + primary_cur.execute("create table t(pk serial primary key, payload integer)") + primary_cur.execute(CREATE_SUBXACTS_FUNC) + + # Prepare a transaction for two-phase commit, with lots of subxids + primary_cur.execute("begin") + primary_cur.execute("select create_subxacts(50000)") + + # to make things a bit more varied, intersperse a few other XIDs in between + # the prepared transaction's sub-XIDs + with primary.connect().cursor() as primary_cur2: + primary_cur2.execute("insert into t (payload) values (123)") + primary_cur2.execute("begin; insert into t (payload) values (-1); rollback") + + primary_cur.execute("select create_subxacts(50000)") + primary_cur.execute("prepare transaction 't1'") + + # Wait for the WAL to be flushed + wait_for_last_flush_lsn(env, primary, env.initial_tenant, env.initial_timeline) + + primary.stop(mode="fast") + + # Create a replica. It should start up normally, thanks to the CLOG-scanning + # mechanism. 
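+ # (The prepared transaction's ~100,000 subxids are tracked in pg_subtrans on + # the standby rather than in the known-assigned XIDs array, so startup should + # not hit the "too many KnownAssignedXids" limit.)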
+ secondary = env.endpoints.new_replica_start( + origin=primary, endpoint_id="secondary", config_lines=["max_prepared_transactions=5"] + ) + + # The transaction did not commit, so it should not be visible in the secondary + secondary_conn = secondary.connect() + secondary_cur = secondary_conn.cursor() + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (1,) + + primary.start() + + # Open a lot of subtransactions in the primary, causing the subxids cache to overflow + primary_conn = primary.connect() + primary_cur = primary_conn.cursor() + primary_cur.execute("select create_subxacts(100000)") + + wait_replica_caughtup(primary, secondary) + + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (100001,) + + primary_cur.execute("commit prepared 't1'") + + wait_replica_caughtup(primary, secondary) + secondary_cur.execute("select count(*) from t") + assert secondary_cur.fetchone() == (200001,) diff --git a/test_runner/regress/test_replication_start.py b/test_runner/regress/test_replication_start.py deleted file mode 100644 index 236074599021..000000000000 --- a/test_runner/regress/test_replication_start.py +++ /dev/null @@ -1,32 +0,0 @@ -import pytest -from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnv, wait_replica_caughtup - - -@pytest.mark.xfail -def test_replication_start(neon_simple_env: NeonEnv): - env = neon_simple_env - - with env.endpoints.create_start(branch_name="main", endpoint_id="primary") as primary: - with primary.connect() as p_con: - with p_con.cursor() as p_cur: - p_cur.execute("begin") - p_cur.execute("create table t(pk integer primary key, payload integer)") - p_cur.execute("insert into t values (generate_series(1,100000), 0)") - p_cur.execute("select txid_current()") - xid = p_cur.fetchall()[0][0] - log.info(f"Master transaction {xid}") - with env.endpoints.new_replica_start( - origin=primary, endpoint_id="secondary" - ) as secondary: - wait_replica_caughtup(primary, secondary) - with secondary.connect() as s_con: - with s_con.cursor() as s_cur: - # Enforce setting hint bits for pg_class tuples. - # If master's transaction is not marked as in-progress in MVCC snapshot, - # then XMIN_INVALID hint bit will be set for table's 't' tuple makeing it invisible. 
- s_cur.execute("select * from pg_class") - p_cur.execute("commit") - wait_replica_caughtup(primary, secondary) - s_cur.execute("select * from t where pk = 1") - assert s_cur.fetchone() == (1, 0) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 62a9f422ee4d..4471237900b8 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -225,6 +225,12 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: workload.validate() workload.stop() + # Do a full image layer generation before splitting, so that when we compact after splitting + # we should only see sizes decrease (from post-split drops/rewrites), not increase (from image layer generation) + env.get_tenant_pageserver(tenant_id).http_client().timeline_compact( + tenant_id, timeline_id, force_image_layer_creation=True, wait_until_uploaded=True + ) + # Split one shard into two shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2) @@ -542,6 +548,13 @@ def check_effective_tenant_config(): for k, v in non_default_tenant_config.items(): assert config.effective_config[k] == v + # Check that heatmap uploads remain enabled after shard split + # (https://github.com/neondatabase/neon/issues/8189) + assert ( + config.effective_config["heatmap_period"] + and config.effective_config["heatmap_period"] != "0s" + ) + # Validate pageserver state: expect every child shard to have an attached and secondary location (total, attached) = get_node_shard_counts(env, tenant_ids=[tenant_id]) assert sum(attached.values()) == split_shard_count @@ -1137,10 +1150,6 @@ def test_sharding_split_failures( ) for ps in env.pageservers: - # When we do node failures and abandon a shard, it will de-facto have old generation and - # thereby be unable to publish remote consistent LSN updates - ps.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - # If we're using a failure that will panic the storage controller, all background # upcalls from the pageserver can fail ps.allowed_errors.append(".*calling control plane generation validation API failed.*") diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 9cc13ecfdbca..d37f7aae3dfd 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -60,11 +60,6 @@ def test_storage_controller_smoke( neon_env_builder.num_pageservers = 3 env = neon_env_builder.init_configs() - for pageserver in env.pageservers: - # This test detaches tenants during migration, which can race with deletion queue operations, - # during detach we only do an advisory flush, we don't wait for it. 
- pageserver.allowed_errors.extend([".*Dropped remote consistent LSN updates.*"]) - # Start services by hand so that we can skip a pageserver (this will start + register later) env.broker.try_start() env.storage_controller.start() @@ -315,7 +310,7 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up # Create a tenant directly via pageserver HTTP API, skipping the storage controller tenant_id = TenantId.generate() generation = 123 - origin_ps.http_client().tenant_create(tenant_id, generation=generation) + origin_ps.tenant_create(tenant_id, generation=generation) # As if doing a live migration, first configure origin into stale mode r = origin_ps.http_client().tenant_location_conf( @@ -484,9 +479,6 @@ def handler(request: Request): # Start running env = neon_env_builder.init_start() - # We will to an unclean migration, which will result in deletion queue warnings - env.pageservers[0].allowed_errors.append(".*Dropped remote consistent LSN updates for tenant.*") - # Initial notification from tenant creation assert len(notifications) == 1 expect: Dict[str, Union[List[Dict[str, int]], str, None, int]] = { @@ -1054,13 +1046,6 @@ def tenants_placed(): online_node_ids = set(range(1, len(env.pageservers) + 1)) - offline_node_ids for node_id in offline_node_ids: - env.get_pageserver(node_id).allowed_errors.append( - # In the case of the failpoint failure, the impacted pageserver - # still believes it has the tenant attached since location - # config calls into it will fail due to being marked offline. - ".*Dropped remote consistent LSN updates.*", - ) - if len(offline_node_ids) > 1: env.get_pageserver(node_id).allowed_errors.append( ".*Scheduling error when marking pageserver.*offline.*", @@ -1518,49 +1503,6 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto workload.validate() -def retryable_node_operation(op, ps_id, max_attempts, backoff): - while max_attempts > 0: - try: - op(ps_id) - return - except StorageControllerApiException as e: - max_attempts -= 1 - log.info(f"Operation failed ({max_attempts} attempts left): {e}") - - if max_attempts == 0: - raise e - - time.sleep(backoff) - - -def poll_node_status(env, node_id, desired_scheduling_policy, max_attempts, backoff): - log.info(f"Polling {node_id} for {desired_scheduling_policy} scheduling policy") - while max_attempts > 0: - try: - status = env.storage_controller.node_status(node_id) - policy = status["scheduling"] - if policy == desired_scheduling_policy: - return - else: - max_attempts -= 1 - log.info(f"Status call returned {policy=} ({max_attempts} attempts left)") - - if max_attempts == 0: - raise AssertionError( - f"Status for {node_id=} did not reach {desired_scheduling_policy=}" - ) - - time.sleep(backoff) - except StorageControllerApiException as e: - max_attempts -= 1 - log.info(f"Status call failed ({max_attempts} retries left): {e}") - - if max_attempts == 0: - raise e - - time.sleep(backoff) - - def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): """ Graceful reststart of storage controller clusters use the drain and @@ -1601,10 +1543,10 @@ def assert_shard_counts_balanced(env: NeonEnv, shard_counts, total_shards): # Perform a graceful rolling restart for ps in env.pageservers: - retryable_node_operation( + env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 ) - poll_node_status(env, ps.id, "PauseForRestart", max_attempts=6, backoff=5) + 
env.storage_controller.poll_node_status(ps.id, "PauseForRestart", max_attempts=6, backoff=5) shard_counts = get_node_shard_counts(env, tenant_ids) log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") @@ -1614,12 +1556,12 @@ def assert_shard_counts_balanced(env: NeonEnv, shard_counts, total_shards): assert sum(shard_counts.values()) == total_shards ps.restart() - poll_node_status(env, ps.id, "Active", max_attempts=10, backoff=1) + env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=10, backoff=1) - retryable_node_operation( + env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 ) - poll_node_status(env, ps.id, "Active", max_attempts=6, backoff=5) + env.storage_controller.poll_node_status(ps.id, "Active", max_attempts=6, backoff=5) shard_counts = get_node_shard_counts(env, tenant_ids) log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") @@ -1636,7 +1578,7 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_configs() env.start() - tenant_count = 5 + tenant_count = 10 shard_count_per_tenant = 8 tenant_ids = [] @@ -1657,15 +1599,15 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): ps_id_to_drain = env.pageservers[0].id - retryable_node_operation( + env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps_id_to_drain, max_attempts=3, backoff=2, ) - poll_node_status(env, ps_id_to_drain, "Draining", max_attempts=6, backoff=2) + env.storage_controller.poll_node_status(ps_id_to_drain, "Draining", max_attempts=6, backoff=2) env.storage_controller.cancel_node_drain(ps_id_to_drain) - poll_node_status(env, ps_id_to_drain, "Active", max_attempts=6, backoff=2) + env.storage_controller.poll_node_status(ps_id_to_drain, "Active", max_attempts=6, backoff=2) diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py index d7f396262059..91caad722051 100644 --- a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -54,4 +54,4 @@ def insert_data(pub): pcur.execute(f"INSERT into t values ({n_records}, 0)") n_records += 1 with sub.cursor() as scur: - wait_until(10, 0.5, check_that_changes_propagated) + wait_until(60, 0.5, check_that_changes_propagated) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 2cbb036c0d7c..80fb2b55b8b2 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -320,10 +320,6 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): assert not config_path.exists(), "detach did not remove config file" - # The re-attach's increment of the generation number may invalidate deletion queue - # updates in flight from the previous attachment. 
- env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - env.pageserver.tenant_attach(tenant_id) wait_until( number_of_iterations=5, diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index d3fba32a19e0..1d7c8b8e31f0 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -67,8 +67,9 @@ def test_tenant_delete_smoke( # first try to delete non existing tenant tenant_id = TenantId.generate() - env.pageserver.allowed_errors.append(".*NotFound.*") - env.pageserver.allowed_errors.append(".*simulated failure.*") + env.pageserver.allowed_errors.extend( + [".*NotFound.*", ".*simulated failure.*", ".*failed to delete .+ objects.*"] + ) # Check that deleting a non-existent tenant gives the expected result: this is a loop because we # may need to retry on some remote storage errors injected by the test harness diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 2056840558e6..b165588636c7 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -76,10 +76,6 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str): env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # Our re-attach may race with the deletion queue processing LSN updates - # from the original attachment. - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: with endpoint.cursor() as cur: cur.execute("CREATE TABLE t(key int primary key, value text)") @@ -349,10 +345,6 @@ def test_detach_while_attaching( env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # Our re-attach may race with the deletion queue processing LSN updates - # from the original attachment. - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - # Create table, and insert some rows. Make it big enough that it doesn't fit in # shared_buffers, otherwise the SELECT after restart will just return answer # from shared_buffers without hitting the page server, which defeats the point @@ -422,10 +414,6 @@ def test_detach_while_activating( env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) - # Our re-attach may race with the deletion queue processing LSN updates - # from the original attachment. 
- env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*") - - data_id = 1 data_secret = "very secret secret" insert_test_data(pageserver_http, tenant_id, timeline_id, data_id, data_secret, endpoint) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 9fe732e28806..43e9a0d36e80 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -203,8 +203,6 @@ def test_tenant_relocation( [ # Needed for detach polling on the original pageserver f".*NotFound: tenant {tenant_id}.*", - # We will dual-attach in this test, so stale generations are expected - ".*Dropped remote consistent LSN updates.*", ] ) diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 6c85ddebbcfb..b1ade77a1474 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -10,6 +10,7 @@ Endpoint, NeonEnv, NeonEnvBuilder, + flush_ep_to_pageserver, wait_for_last_flush_lsn, wait_for_wal_insert_lsn, ) @@ -710,3 +711,115 @@ def mask_model_inputs(x): return newlist else: return x + + +@pytest.mark.parametrize("zero_gc", [True, False]) +def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, zero_gc: bool): + """ + Compare an LSN lease to a read-only branch for synthetic size calculation. + They should have the same effect. + """ + + def assert_size_approx_equal_for_lease_test(size_lease, size_branch): + """ + Tests that evaluate sizes check the pageserver space consumption, + which sits many layers below the user input. The exact space needed + varies slightly depending on postgres behavior. + + Rather than expecting postgres to be deterministic and occasionally + failing the test, we permit sizes for the same data to vary by a few pages. + """ + + # FIXME(yuchen): The delta is too large; this is a temporary solution to pass the test reliably. + # Investigate and reduce the threshold. + threshold = 22 * 8272 + + log.info( + f"delta: size_branch({size_branch}) - size_lease({size_lease}) = {size_branch - size_lease}" + ) + + assert size_lease == pytest.approx(size_branch, abs=threshold) + + conf = { + "pitr_interval": "0s" if zero_gc else "3600s", + "gc_period": "0s", + "compaction_period": "0s", + } + + env = neon_env_builder.init_start(initial_tenant_conf=conf) + + ro_branch_res = insert_with_action( + env, env.initial_tenant, env.initial_timeline, test_output_dir, action="branch" + ) + + tenant, timeline = env.neon_cli.create_tenant(conf=conf) + lease_res = insert_with_action(env, tenant, timeline, test_output_dir, action="lease") + + assert_size_approx_equal_for_lease_test(lease_res, ro_branch_res) + + +def insert_with_action( + env: NeonEnv, + tenant: TenantId, + timeline: TimelineId, + test_output_dir: Path, + action: str, +) -> int: + """ + Inserts some data on the timeline, performs an action, and then inserts more data on the same timeline. + Returns the size at the end of the insertion. + + Valid actions: + - "lease": Acquires a lease. + - "branch": Creates a child branch but never writes to it.
+ """ + + client = env.pageserver.http_client() + with env.endpoints.create_start( + "main", + tenant_id=tenant, + config_lines=["autovacuum=off"], + ) as ep: + initial_size = client.tenant_size(tenant) + log.info(f"initial size: {initial_size}") + + with ep.cursor() as cur: + cur.execute( + "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline) + + if action == "lease": + res = client.timeline_lsn_lease(tenant, timeline, last_flush_lsn) + log.info(f"result from lsn_lease api: {res}") + elif action == "branch": + ro_branch = env.neon_cli.create_branch( + "ro_branch", tenant_id=tenant, ancestor_start_lsn=last_flush_lsn + ) + log.info(f"{ro_branch=} created") + else: + raise AssertionError("Invalid action type, only `lease` and `branch` are accepted") + + with ep.cursor() as cur: + cur.execute( + "CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + cur.execute( + "CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + cur.execute( + "CREATE TABLE t3 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)" + ) + + last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline) + + # Avoid flakiness when calculating logical size. + flush_ep_to_pageserver(env, ep, tenant, timeline) + + size_after_action_and_insert = client.tenant_size(tenant) + log.info(f"{size_after_action_and_insert=}") + + size_debug_file = open(test_output_dir / f"size_debug_{action}.html", "w") + size_debug = client.tenant_size_debug(tenant) + size_debug_file.write(size_debug) + return size_after_action_and_insert diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 93e9ad367367..04b3fdd80fa5 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -41,18 +41,35 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): neon_simple_env.storage_controller.allowed_errors.extend(error_regexes) pageserver_http = neon_simple_env.pageserver.http_client() + + # Failure to write a config to local disk makes the pageserver assume that the local disk is bad and abort the process pageserver_http.configure_failpoints(("tenant-config-before-write", "return")) - with pytest.raises(Exception, match="tenant-config-before-write"): + + # The storage controller will see a torn TCP connection when the crash point is reached, and follow an unclean 500 error path + neon_simple_env.storage_controller.allowed_errors.extend( + [ + ".*Reconcile not done yet while creating tenant.*", + ".*Reconcile error: receive body: error sending request.*", + ".*Error processing HTTP request: InternalServerError.*", + ] + ) + + with pytest.raises(Exception, match="error sending request"): + _ = neon_simple_env.neon_cli.create_tenant() + + # Any files left behind on disk during failed creation do not prevent + # a retry from succeeding. Restart the pageserver with no failpoints. + neon_simple_env.pageserver.running = False + neon_simple_env.pageserver.start() + + # The failed creation should not be present in the list of tenants, as when we start up we'll see + # an empty tenant dir with no config in it.
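+ # (The leftover empty tenant directory makes the pageserver warn that it + # failed to load the tenant config on startup, hence the allowed error below.)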
+ neon_simple_env.pageserver.allowed_errors.append(".*Failed to load tenant config.*") new_tenants = sorted( map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) ) assert initial_tenants == new_tenants, "should not create new tenants" - # Any files left behind on disk during failed creation do not prevent - # a retry from succeeding. - pageserver_http.configure_failpoints(("tenant-config-before-write", "off")) neon_simple_env.neon_cli.create_tenant() @@ -369,10 +386,6 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): # generation nubmers out of order. env.pageserver.allowed_errors.append(".*Generation .+ is less than existing .+") - # Our multiple creation requests will advance generation quickly, and when we skip - # a generation number we can generate these warnings - env.pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates for tenant .+") - # Timeline::flush_and_shutdown cannot tell if it is hitting a failure because of # an incomplete attach, or some other problem. In the field this should be rare, # so we allow it to log at WARN, even if it is occasionally a false positive. diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index f47356839c26..5e9a42f6b41e 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -152,10 +152,12 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder): client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) + size_limit_mb = 30 + endpoint_main = env.endpoints.create( "test_timeline_size_quota_on_startup", # Set small limit for the test - config_lines=["neon.max_cluster_size=30MB"], + config_lines=[f"neon.max_cluster_size={size_limit_mb}MB"], ) endpoint_main.start() @@ -165,17 +167,39 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder): # Insert many rows. This query must fail because of space limit try: - for _i in range(5000): - cur.execute( - """ - INSERT INTO foo - SELECT 'long string to consume some space' || g - FROM generate_series(1, 100) g - """ - ) - # If we get here, the timeline size limit failed - log.error("Query unexpectedly succeeded") + def write_rows(count): + for _i in range(count): + cur.execute( + """ + INSERT INTO foo + SELECT 'long string to consume some space' || g + FROM generate_series(1, 100) g + """ + ) + + # Write some data that exceeds limit, then let the pageserver ingest it to guarantee that some feedback has made it to + # the safekeeper, then try to write some more. We expect either the initial writes or the ones after + # the wait_for_last_flush_lsn to generate an exception. + # + # Without the wait_for_last_flush_lsn, the size limit sometimes isn't enforced (see https://github.com/neondatabase/neon/issues/6562) + write_rows(2500) + wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id) + logical_size = env.pageserver.http_client().timeline_detail( + env.initial_tenant, new_timeline_id + )["current_logical_size"] + assert logical_size > size_limit_mb * 1024 * 1024 + write_rows(2500) + + # If we get here, the timeline size limit failed. Find out from the pageserver how large it + # thinks the timeline is. 
+ wait_for_last_flush_lsn(env, endpoint_main, env.initial_tenant, new_timeline_id) + logical_size = env.pageserver.http_client().timeline_detail( + env.initial_tenant, new_timeline_id + )["current_logical_size"] + log.error( + f"Query unexpectedly succeeded, pageserver logical size is {logical_size}" + ) raise AssertionError() except psycopg2.errors.DiskFull as err: diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index ac1a3bef67bd..7efd86e3497d 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1725,7 +1725,10 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): # Basic pull_timeline test. -def test_pull_timeline(neon_env_builder: NeonEnvBuilder): +# When live_sk_change is False, the compute is restarted to change the set of +# safekeepers; otherwise it is reconfigured live. +@pytest.mark.parametrize("live_sk_change", [False, True]) +def test_pull_timeline(neon_env_builder: NeonEnvBuilder, live_sk_change: bool): neon_env_builder.auth_enabled = True def execute_payload(endpoint: Endpoint): @@ -1758,8 +1761,7 @@ def show_statuses(safekeepers: List[Safekeeper], tenant_id: TenantId, timeline_i log.info("Use only first 3 safekeepers") env.safekeepers[3].stop() endpoint = env.endpoints.create("main") - endpoint.active_safekeepers = [1, 2, 3] - endpoint.start() + endpoint.start(safekeepers=[1, 2, 3]) execute_payload(endpoint) show_statuses(env.safekeepers, tenant_id, timeline_id) @@ -1771,29 +1773,22 @@ def show_statuses(safekeepers: List[Safekeeper], tenant_id: TenantId, timeline_i log.info("Initialize new safekeeper 4, pull data from 1 & 3") env.safekeepers[3].start() - res = ( - env.safekeepers[3] - .http_client(auth_token=env.auth_keys.generate_safekeeper_token()) - .pull_timeline( - { - "tenant_id": str(tenant_id), - "timeline_id": str(timeline_id), - "http_hosts": [ - f"http://localhost:{env.safekeepers[0].port.http}", - f"http://localhost:{env.safekeepers[2].port.http}", - ], - } - ) + res = env.safekeepers[3].pull_timeline( + [env.safekeepers[0], env.safekeepers[2]], tenant_id, timeline_id ) log.info("Finished pulling timeline") log.info(res) show_statuses(env.safekeepers, tenant_id, timeline_id) - log.info("Restarting compute with new config to verify that it works") - endpoint.stop_and_destroy().create("main") - endpoint.active_safekeepers = [1, 3, 4] - endpoint.start() + action = "reconfiguring" if live_sk_change else "restarting" + log.info(f"{action} compute with new config to verify that it works") + new_sks = [1, 3, 4] + if not live_sk_change: + endpoint.stop_and_destroy().create("main") + endpoint.start(safekeepers=new_sks) + else: + endpoint.reconfigure(safekeepers=new_sks) execute_payload(endpoint) show_statuses(env.safekeepers, tenant_id, timeline_id) @@ -2196,24 +2191,25 @@ def test_s3_eviction( ): neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS) - env = neon_env_builder.init_start( - initial_tenant_conf={ - "checkpoint_timeout": "100ms", - } - ) - extra_opts = [ + neon_env_builder.safekeeper_extra_opts = [ "--enable-offload", "--partial-backup-timeout", "50ms", "--control-file-save-interval", "1s", + # Safekeepers usually wait a while before evicting something: for this test we want them to + # evict things as soon as they are inactive.
+ "--eviction-min-resident=100ms", ] if delete_offloaded_wal: - extra_opts.append("--delete-offloaded-wal") + neon_env_builder.safekeeper_extra_opts.append("--delete-offloaded-wal") - for sk in env.safekeepers: - sk.stop().start(extra_opts=extra_opts) + env = neon_env_builder.init_start( + initial_tenant_conf={ + "checkpoint_timeout": "100ms", + } + ) n_timelines = 5 @@ -2268,7 +2264,7 @@ def test_s3_eviction( # restarting random safekeepers for sk in env.safekeepers: if random.random() < restart_chance: - sk.stop().start(extra_opts=extra_opts) + sk.stop().start() time.sleep(0.5) # require at least one successful eviction in at least one safekeeper diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 7845c122d51d..ad73770c446e 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 7845c122d51d3ebb547a984a640ac0310a2fadce +Subproject commit ad73770c446ea361f43e4f0404798b7e5e7a62d8 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 2ff5ecc67c64..4874c8e52ed3 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 2ff5ecc67c64e5fe44b7dde598e64e4538e0c373 +Subproject commit 4874c8e52ed349a9f8290bbdcd91eb92677a5d24 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index d55e0aca104a..b810fdfcbb59 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit d55e0aca104af0b611cf5565f1033b2acd2dcc1c +Subproject commit b810fdfcbb59afea7ea7bbe0cf94eaccb55a2ea2 diff --git a/vendor/revisions.json b/vendor/revisions.json index e755cf2e9dfa..da49ff19c3ec 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,5 +1,5 @@ { - "v16": ["16.3", "d55e0aca104af0b611cf5565f1033b2acd2dcc1c"], - "v15": ["15.7", "2ff5ecc67c64e5fe44b7dde598e64e4538e0c373"], - "v14": ["14.12", "7845c122d51d3ebb547a984a640ac0310a2fadce"] + "v16": ["16.3", "b810fdfcbb59afea7ea7bbe0cf94eaccb55a2ea2"], + "v15": ["15.7", "4874c8e52ed349a9f8290bbdcd91eb92677a5d24"], + "v14": ["14.12", "ad73770c446ea361f43e4f0404798b7e5e7a62d8"] } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index f43076171f21..832fe06bf697 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -30,13 +30,12 @@ chrono = { version = "0.4", default-features = false, features = ["clock", "serd clap = { version = "4", features = ["derive", "string"] } clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] } crossbeam-utils = { version = "0.8" } +deranged = { version = "0.3", default-features = false, features = ["powerfmt", "serde", "std"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } futures-channel = { version = "0.3", features = ["sink"] } -futures-core = { version = "0.3" } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } -futures-sink = { version = "0.3" } futures-util = { version = "0.3", features = ["channel", "io", "sink"] } getrandom = { version = "0.2", default-features = false, features = ["std"] } hashbrown = { version = "0.14", features = ["raw"] } @@ -69,6 +68,7 @@ sha2 = { version = "0.10", features = ["asm"] } smallvec = { version = "1", default-features = false, features = ["const_new", "write"] } subtle = { version = "2" } sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } +tikv-jemalloc-sys = { version = "0.5" } time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version 
= "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-rustls = { version = "0.24" } @@ -106,7 +106,9 @@ num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } parquet = { git = "https://github.com/apache/arrow-rs", branch = "master", default-features = false, features = ["zstd"] } +proc-macro2 = { version = "1" } prost = { version = "0.11" } +quote = { version = "1" } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" }