diff --git a/.codespellrc b/.codespellrc
new file mode 100644
index 00000000..f3f65c4d
--- /dev/null
+++ b/.codespellrc
@@ -0,0 +1,6 @@
+[codespell]
+# Ref: https://github.com/codespell-project/codespell#using-a-config-file
+skip = .git*,*.svg,*.lock,*.css,.codespellrc
+check-hidden = true
+ignore-regex = ^\s*"image/\S+": ".*|\bND\b
+ignore-words-list = crate
diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 00000000..a894e29e
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+*.ipynb linguist-detectable=false
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index cc0cdfd8..f226efb9 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -18,3 +18,7 @@ updates:
       day: "monday"
       time: "05:00"
       timezone: "US/Pacific"
+    groups:
+      actions:
+        patterns:
+          - "*"
diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml
new file mode 100644
index 00000000..c59e0473
--- /dev/null
+++ b/.github/workflows/codespell.yml
@@ -0,0 +1,25 @@
+# Codespell configuration is within .codespellrc
+---
+name: Codespell
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+    branches: [main]
+
+permissions:
+  contents: read
+
+jobs:
+  codespell:
+    name: Check for spelling errors
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Annotate locations with typos
+        uses: codespell-project/codespell-problem-matcher@v1
+      - name: Codespell
+        uses: codespell-project/actions-codespell@v2
diff --git a/.github/workflows/deploy-docs.yaml b/.github/workflows/deploy-docs.yaml
new file mode 100644
index 00000000..8e362383
--- /dev/null
+++ b/.github/workflows/deploy-docs.yaml
@@ -0,0 +1,67 @@
+name: deploy-docs
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+permissions:
+  contents: write
+jobs:
+  deploy:
+    defaults:
+      run:
+        working-directory: ./docs
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Configure Git Credentials
+        run: |
+          git config user.name github-actions[bot]
+          git config user.email 41898282+github-actions[bot]@users.noreply.github.com
+
+      - uses: actions/setup-python@v5
+        with:
+          python-version: 3.x
+
+      - name: Load cached Poetry installation
+        id: cached-poetry
+        uses: actions/cache@v4
+        with:
+          path: ~/.local # the path depends on the OS
+          key: poetry-0 # increment to reset cache
+
+      - name: Install Poetry
+        if: steps.cached-poetry.outputs.cache-hit != 'true'
+        uses: snok/install-poetry@v1
+        with:
+          virtualenvs-create: true
+          virtualenvs-in-project: true
+          virtualenvs-path: .venv
+          installer-parallel: true
+
+      - name: Load cached venv
+        id: cached-poetry-dependencies
+        uses: actions/cache@v4
+        with:
+          path: .venv
+          key: venv-${{ runner.os }}-${{ steps.setup-python.outputs.python-version }}-${{ hashFiles('**/poetry.lock') }}
+
+      - name: Install dependencies (with cache)
+        if: steps.cached-poetry-dependencies.outputs.cache-hit != 'true'
+        run: poetry install --no-interaction --no-root
+
+      - name: Install project
+        run: poetry install --no-interaction
+
+      - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
+
+      - name: Setup Cache for MkDocs
+        uses: actions/cache@v4
+        with:
+          key: mkdocs-material-${{ env.cache_id }}
+          path: .cache
+          restore-keys: |
+            mkdocs-material-
+
+      - name: Deploy to gh-pages
+        run: poetry run mkdocs gh-deploy --force
\ No newline at end of file
diff --git a/.github/workflows/publish-rust-library.yml b/.github/workflows/publish-rust-library.yml
new file mode 100644
index 00000000..26afd852
--- /dev/null
+++ b/.github/workflows/publish-rust-library.yml
@@ -0,0 +1,64 @@
+name: publish rust library
+
+env:
+  CARGO_INCREMENTAL: 0
+  CARGO_NET_RETRY: 10
+  CI: 1
+  RUST_BACKTRACE: short
+  RUSTFLAGS: "-D warnings -W unreachable-pub -W bare-trait-objects"
+  RUSTUP_MAX_RETRIES: 10
+  RUST_CHANNEL: 'stable'
+
+on:
+  workflow_dispatch:
+
+jobs:
+  publish:
+    name: Check and publish
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Stand up MinIO
+        run: |
+          docker compose up -d minio
+      - name: Wait for MinIO to be ready
+        run: |
+          for i in {1..10}; do
+            if curl --silent --fail http://minio:9000/minio/health/live; then
+              break
+            fi
+            sleep 3
+          done
+          docker compose exec -T minio mc alias set minio http://minio:9000 minio123 minio123
+
+      - name: Install Just
+        run: sudo snap install --edge --classic just
+
+      - name: Install Rust toolchain
+        run: |
+          rustup update --no-self-update ${{ env.RUST_CHANNEL }}
+          rustup component add --toolchain ${{ env.RUST_CHANNEL }} rustfmt rust-src clippy
+          rustup default ${{ env.RUST_CHANNEL }}
+
+      - name: Cache Dependencies
+        uses: Swatinem/rust-cache@v2
+        with:
+          # workspaces: "rust -> target"
+          key: ${{ env.RUST_CHANNEL }}
+
+      - name: Install cargo-deny
+        run: cargo install --locked cargo-deny
+
+      - name: Check
+        if: matrix.os == 'ubuntu-latest' || github.event_name == 'push'
+        run: |
+          just pre-commit
+
+      - name: Install cargo-release
+        run: cargo install --locked cargo-release
+
+      - name: publish to crates
+        run: cargo release --tag-prefix icechunk-rust --execute
+        env:
+          CARGO_REGISTRY_TOKEN: ${{ secrets.CARGO_REGISTRY_TOKEN }}
diff --git a/.github/workflows/python-check.yaml b/.github/workflows/python-check.yaml
new file mode 100644
index 00000000..83d57f00
--- /dev/null
+++ b/.github/workflows/python-check.yaml
@@ -0,0 +1,95 @@
+name: Python Check
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    paths:
+      - 'icechunk/**'
+      - 'icechunk-python/**'
+      - '.github/workflows/python-check.yaml'
+  workflow_dispatch:
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read
+
+defaults:
+  run:
+    working-directory: ./icechunk-python
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Stand up MinIO
+        run: |
+          docker compose up -d minio
+      - name: Wait for MinIO to be ready
+        run: |
+          for _ in {1..10}; do
+            if curl --silent --fail http://minio:9000/minio/health/live; then
+              break
+            fi
+            sleep 3
+          done
+          docker compose exec -T minio mc alias set minio http://minio:9000 minio123 minio123
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+      - name: Build wheels
+        uses: PyO3/maturin-action@v1
+        with:
+          working-directory: icechunk-python
+          target: ${{ matrix.platform.target }}
+          args: --release --out dist --find-interpreter
+          sccache: 'true'
+          manylinux: ${{ matrix.platform.manylinux }} # https://github.com/PyO3/maturin-action/issues/245
+      - name: mypy
+        shell: bash
+        working-directory: icechunk-python
+        run: |
+          set -e
+          python3 -m venv .venv
+          source .venv/bin/activate
+          pip install icechunk['test'] --find-links dist --force-reinstall
+          mypy python
+      - name: ruff
+        shell: bash
+        working-directory: icechunk-python
+        run: |
+          set -e
+          python3 -m venv .venv
+          source .venv/bin/activate
+          pip install icechunk['test'] --find-links dist --force-reinstall
+          ruff check
+      - name: Restore cached hypothesis directory
+        id: restore-hypothesis-cache
+        uses: actions/cache/restore@v4
+        with:
+          path: icechunk-python/.hypothesis/
+          key: cache-hypothesis-${{ runner.os }}-${{ github.run_id }}
restore-keys: | + cache-hypothesis- + - name: pytest + shell: bash + working-directory: icechunk-python + run: | + set -e + python3 -m venv .venv + source .venv/bin/activate + pip install icechunk['test'] --find-links dist --force-reinstall + pytest + # explicitly save the cache so it gets updated, also do this even if it fails. + - name: Save cached hypothesis directory + id: save-hypothesis-cache + if: always() + uses: actions/cache/save@v4 + with: + path: icechunk-python/.hypothesis/ + key: cache-hypothesis-${{ runner.os }}-${{ github.run_id }} diff --git a/.github/workflows/python-ci.yaml b/.github/workflows/python-ci.yaml index 58082ec2..97f38b15 100644 --- a/.github/workflows/python-ci.yaml +++ b/.github/workflows/python-ci.yaml @@ -6,14 +6,14 @@ name: Python CI on: - push: - branches: - - main - - master - tags: - - '*' - pull_request: workflow_dispatch: + schedule: + # run every day at 4am + - cron: '0 4 * * *' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true permissions: contents: read @@ -30,16 +30,15 @@ jobs: platform: - runner: ubuntu-latest target: x86_64 + manylinux: auto # - runner: ubuntu-latest # target: x86 - # - runner: ubuntu-latest - # target: aarch64 - # - runner: ubuntu-latest - # target: armv7 - # - runner: ubuntu-latest - # target: s390x - # - runner: ubuntu-latest - # target: ppc64le + - runner: ubuntu-latest + target: aarch64 + manylinux: 2_28 + - runner: ubuntu-latest + target: armv7 + manylinux: 2_28 steps: - uses: actions/checkout@v4 - name: Stand up MinIO @@ -56,7 +55,7 @@ jobs: docker compose exec -T minio mc alias set minio http://minio:9000 minio123 minio123 - uses: actions/setup-python@v5 with: - python-version: 3.x + python-version: '3.11' - name: Build wheels uses: PyO3/maturin-action@v1 with: @@ -64,207 +63,100 @@ jobs: target: ${{ matrix.platform.target }} args: --release --out dist --find-interpreter sccache: 'true' - manylinux: auto + manylinux: ${{ matrix.platform.manylinux }} # https://github.com/PyO3/maturin-action/issues/245 - name: Upload wheels uses: actions/upload-artifact@v4 with: working-directory: icechunk-python name: wheels-linux-${{ matrix.platform.target }} - path: dist - # https://github.com/actions/cache/blob/main/tips-and-workarounds.md#update-a-cache - - name: Restore cached hypothesis directory - id: restore-hypothesis-cache - uses: actions/cache/restore@v4 + path: icechunk-python/dist + + musllinux: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: ubuntu-latest + target: x86_64 + - runner: ubuntu-latest + target: x86 + - runner: ubuntu-latest + target: aarch64 + - runner: ubuntu-latest + target: armv7 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 with: - path: icechunk-python/.hypothesis/ - key: cache-hypothesis-${{ runner.os }}-${{ github.run_id }} - restore-keys: | - cache-hypothesis- - - name: mypy - shell: bash - working-directory: icechunk-python - run: | - set -e - python3 -m venv .venv - source .venv/bin/activate - pip install icechunk['test'] --find-links dist --force-reinstall - mypy python - - name: ruff - shell: bash - working-directory: icechunk-python - run: | - set -e - python3 -m venv .venv - source .venv/bin/activate - pip install icechunk['test'] --find-links dist --force-reinstall - ruff check - - name: pytest - if: ${{ startsWith(matrix.platform.target, 'x86_64') }} - shell: bash - working-directory: icechunk-python - run: | - set -e - python3 -m venv .venv - source .venv/bin/activate - pip install 
icechunk['test'] --find-links dist --force-reinstall - AWS_ALLOW_HTTP=1 AWS_ENDPOINT_URL=http://localhost:9000 AWS_ACCESS_KEY_ID=minio123 AWS_SECRET_ACCESS_KEY=minio123 pytest - - name: pytest - if: ${{ !startsWith(matrix.platform.target, 'x86') && matrix.platform.target != 'ppc64' }} - uses: uraimo/run-on-arch-action@v2 + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 with: working-directory: icechunk-python - arch: ${{ matrix.platform.target }} - distro: ubuntu22.04 - githubToken: ${{ github.token }} - install: | - apt-get update - apt-get install -y --no-install-recommends python3 python3-pip - pip3 install -U pip pytest - run: | - set -e - pip3 install icechunk['test'] --find-links dist --force-reinstall - AWS_ALLOW_HTTP=1 AWS_ENDPOINT_URL=http://localhost:9000 AWS_ACCESS_KEY_ID=minio123 AWS_SECRET_ACCESS_KEY=minio123 pytest - - # explicitly save the cache so it gets updated, also do this even if it fails. - - name: Save cached hypothesis directory - id: save-hypothesis-cache - if: always() - uses: actions/cache/save@v4 + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + manylinux: musllinux_1_2 + - name: Upload wheels + uses: actions/upload-artifact@v4 with: - path: icehunk-python/.hypothesis/ - key: cache-hypothesis-${{ runner.os }}-${{ github.run_id }} - - # musllinux: - # runs-on: ${{ matrix.platform.runner }} - # strategy: - # matrix: - # platform: - # - runner: ubuntu-latest - # target: x86_64 - # - runner: ubuntu-latest - # target: x86 - # - runner: ubuntu-latest - # target: aarch64 - # - runner: ubuntu-latest - # target: armv7 - # steps: - # - uses: actions/checkout@v4 - # - uses: actions/setup-python@v5 - # with: - # python-version: 3.x - # - name: Build wheels - # uses: PyO3/maturin-action@v1 - # with: - # target: ${{ matrix.platform.target }} - # args: --release --out dist --find-interpreter - # sccache: 'true' - # manylinux: musllinux_1_2 - # - name: Upload wheels - # uses: actions/upload-artifact@v4 - # with: - # name: wheels-musllinux-${{ matrix.platform.target }} - # path: dist - # - name: pytest - # if: ${{ startsWith(matrix.platform.target, 'x86_64') }} - # uses: addnab/docker-run-action@v3 - # with: - # image: alpine:latest - # options: -v ${{ github.workspace }}:/io -w /io - # run: | - # set -e - # apk add py3-pip py3-virtualenv - # python3 -m virtualenv .venv - # source .venv/bin/activate - # pip install icechunk --no-index --find-links dist --force-reinstall - # pip install pytest - # pytest - # - name: pytest - # if: ${{ !startsWith(matrix.platform.target, 'x86') }} - # uses: uraimo/run-on-arch-action@v2 - # with: - # arch: ${{ matrix.platform.target }} - # distro: alpine_latest - # githubToken: ${{ github.token }} - # install: | - # apk add py3-virtualenv - # run: | - # set -e - # python3 -m virtualenv .venv - # source .venv/bin/activate - # pip install pytest - # pip install icechunk --find-links dist --force-reinstall - # pytest + name: wheels-musllinux-${{ matrix.platform.target }} + path: icechunk-python/dist - # windows: - # runs-on: ${{ matrix.platform.runner }} - # strategy: - # matrix: - # platform: - # - runner: windows-latest - # target: x64 - # - runner: windows-latest - # target: x86 - # steps: - # - uses: actions/checkout@v4 - # - uses: actions/setup-python@v5 - # with: - # python-version: 3.x - # architecture: ${{ matrix.platform.target }} - # - name: Build wheels - # uses: PyO3/maturin-action@v1 - # with: - # target: ${{ matrix.platform.target }} - # args: --release 
--out dist --find-interpreter - # sccache: 'true' - # - name: Upload wheels - # uses: actions/upload-artifact@v4 - # with: - # name: wheels-windows-${{ matrix.platform.target }} - # path: dist - # - name: pytest - # if: ${{ !startsWith(matrix.platform.target, 'aarch64') }} - # shell: bash - # run: | - # set -e - # python3 -m venv .venv - # source .venv/Scripts/activate - # pip install icechunk --find-links dist --force-reinstall - # pip install pytest - # pytest + windows: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: windows-latest + target: x64 + - runner: windows-latest + target: x86 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + architecture: ${{ matrix.platform.target }} + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + working-directory: icechunk-python + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-windows-${{ matrix.platform.target }} + path: icechunk-python/dist - # macos: - # runs-on: ${{ matrix.platform.runner }} - # strategy: - # matrix: - # platform: - # - runner: macos-12 - # target: x86_64 - # - runner: macos-14 - # target: aarch64 - # steps: - # - uses: actions/checkout@v4 - # - uses: actions/setup-python@v5 - # with: - # python-version: 3.x - # - name: Build wheels - # uses: PyO3/maturin-action@v1 - # with: - # target: ${{ matrix.platform.target }} - # args: --release --out dist --find-interpreter - # sccache: 'true' - # - name: Upload wheels - # uses: actions/upload-artifact@v4 - # with: - # name: wheels-macos-${{ matrix.platform.target }} - # path: dist - # - name: pytest - # run: | - # set -e - # python3 -m venv .venv - # source .venv/bin/activate - # pip install icechunk --find-links dist --force-reinstall - # pip install pytest - # pytest + macos: + runs-on: ${{ matrix.platform.runner }} + strategy: + matrix: + platform: + - runner: macos-12 + target: x86_64 + - runner: macos-14 + target: aarch64 + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: 3.x + - name: Build wheels + uses: PyO3/maturin-action@v1 + with: + working-directory: icechunk-python + target: ${{ matrix.platform.target }} + args: --release --out dist --find-interpreter + sccache: 'true' + - name: Upload wheels + uses: actions/upload-artifact@v4 + with: + name: wheels-macos-${{ matrix.platform.target }} + path: icechunk-python/dist sdist: runs-on: ubuntu-latest @@ -279,21 +171,20 @@ jobs: - name: Upload sdist uses: actions/upload-artifact@v4 with: - working-directory: icechunk-python name: wheels-sdist - path: dist + path: icechunk-python/dist - # release: - # name: Release - # runs-on: ubuntu-latest - # if: "startsWith(github.ref, 'refs/tags/')" - # needs: [linux, musllinux, windows, macos, sdist] - # steps: - # - uses: actions/download-artifact@v4 - # - name: Publish to PyPI - # uses: PyO3/maturin-action@v1 - # env: - # MATURIN_PYPI_TOKEN: ${{ secrets.PYPI_API_TOKEN }} - # with: - # command: upload - # args: --non-interactive --skip-existing wheels-*/* + release: + name: Release + runs-on: ubuntu-latest + permissions: + id-token: write # IMPORTANT: this permission is mandatory for trusted publishing + if: ${{ github.event_name == 'workflow_dispatch' }} + needs: [linux, musllinux, windows, macos, sdist] + steps: + - uses: actions/download-artifact@v4 + - name: Publish to PyPI + uses: PyO3/maturin-action@v1 
+ with: + command: upload + args: --non-interactive --skip-existing wheels-*/* diff --git a/.github/workflows/rust-ci.yaml b/.github/workflows/rust-ci.yaml index bcbe8d62..abb81bf0 100644 --- a/.github/workflows/rust-ci.yaml +++ b/.github/workflows/rust-ci.yaml @@ -5,10 +5,17 @@ name: Rust CI on: pull_request: types: [opened, reopened, synchronize, labeled] + paths: + - 'icechunk/**' + - '.github/workflows/rust-ci.yaml' push: branches: - main +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + env: CARGO_INCREMENTAL: 0 CARGO_NET_RETRY: 10 @@ -78,11 +85,5 @@ jobs: - name: Check if: matrix.os == 'ubuntu-latest' || github.event_name == 'push' - env: - AWS_ACCESS_KEY_ID: minio123 - AWS_SECRET_ACCESS_KEY: minio123 - AWS_ALLOW_HTTP: 1 - AWS_ENDPOINT_URL: http://localhost:9000 - AWS_DEFAULT_REGION: "us-east-1" run: | just pre-commit diff --git a/.gitignore b/.gitignore index 7ec47617..e43a1f53 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,7 @@ /target # macOS -.DS_Store \ No newline at end of file +.DS_Store + +# Docs build +.docs \ No newline at end of file diff --git a/.mailmap b/.mailmap new file mode 100644 index 00000000..28222ca5 --- /dev/null +++ b/.mailmap @@ -0,0 +1,2 @@ +Sebastián Galkin +Sebastián Galkin diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c770228b..f9cac9d1 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -7,10 +7,14 @@ repos: entry: just pre-commit language: system pass_filenames: false - - repo: https://github.com/rhysd/actionlint rev: v1.6.26 hooks: - id: actionlint files: ".github/workflows/" args: ["-ignore", "SC1090", "-ignore", "SC2046", "-ignore", "SC2086", "-ignore", "SC2129", "-ignore", "SC2155"] + - repo: https://github.com/codespell-project/codespell + # Configuration for codespell is in .codespellrc + rev: v2.3.0 + hooks: + - id: codespell diff --git a/Cargo.lock b/Cargo.lock index fcdc1bdf..7d268927 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -30,6 +30,12 @@ dependencies = [ "zerocopy 0.7.35", ] +[[package]] +name = "allocator-api2" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f" + [[package]] name = "android-tzdata" version = "0.1.1" @@ -80,27 +86,396 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.82" +version = "0.1.83" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a27b8a3a6e1a44fa4c8baf1f653e4172e81486d4941f2237e20dc2d0cf4ddff1" +checksum = "721cae7de5c34fbb2acd27e21e6d2cf7b886dce0c27388d46c4e6c47ea4318dd" dependencies = [ "proc-macro2", "quote", "syn", ] -[[package]] -name = "atomic-waker" -version = "1.1.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" - [[package]] name = "autocfg" version = "1.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" +[[package]] +name = "aws-config" +version = "1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8191fb3091fa0561d1379ef80333c3c7191c6f0435d986e85821bcf7acbd1126" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-sdk-sso", + "aws-sdk-ssooidc", + "aws-sdk-sts", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + 
"bytes", + "fastrand", + "hex", + "http 0.2.12", + "ring", + "time", + "tokio", + "tracing", + "url", + "zeroize", +] + +[[package]] +name = "aws-credential-types" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60e8f6b615cb5fc60a98132268508ad104310f0cfb25a1c22eee76efdf9154da" +dependencies = [ + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "zeroize", +] + +[[package]] +name = "aws-runtime" +version = "1.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a10d5c055aa540164d9561a0e2e74ad30f0dcf7393c3a92f6733ddf9c5762468" +dependencies = [ + "aws-credential-types", + "aws-sigv4", + "aws-smithy-async", + "aws-smithy-eventstream", + "aws-smithy-http", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "http 0.2.12", + "http-body 0.4.6", + "once_cell", + "percent-encoding", + "pin-project-lite", + "tracing", + "uuid", +] + +[[package]] +name = "aws-sdk-s3" +version = "1.53.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43fad71130014e11f42fadbdcce5df12ee61866f8ab9bad773b138d4b3c11087" +dependencies = [ + "ahash", + "aws-credential-types", + "aws-runtime", + "aws-sigv4", + "aws-smithy-async", + "aws-smithy-checksums", + "aws-smithy-eventstream", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", + "bytes", + "fastrand", + "hex", + "hmac", + "http 0.2.12", + "http-body 0.4.6", + "lru", + "once_cell", + "percent-encoding", + "regex-lite", + "sha2", + "tracing", + "url", +] + +[[package]] +name = "aws-sdk-sso" +version = "1.44.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b90cfe6504115e13c41d3ea90286ede5aa14da294f3fe077027a6e83850843c" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "http 0.2.12", + "once_cell", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-ssooidc" +version = "1.45.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "167c0fad1f212952084137308359e8e4c4724d1c643038ce163f06de9662c1d0" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "http 0.2.12", + "once_cell", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-sts" +version = "1.44.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2cb5f98188ec1435b68097daa2a37d74b9d17c9caa799466338a8d1544e71b9d" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-query", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", + "http 0.2.12", + "once_cell", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sigv4" +version = "1.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc8db6904450bafe7473c6ca9123f88cc11089e41a025408f992db4e22d3be68" +dependencies = [ + "aws-credential-types", + "aws-smithy-eventstream", + "aws-smithy-http", + "aws-smithy-runtime-api", + "aws-smithy-types", + 
"bytes", + "crypto-bigint 0.5.5", + "form_urlencoded", + "hex", + "hmac", + "http 0.2.12", + "http 1.1.0", + "once_cell", + "p256", + "percent-encoding", + "ring", + "sha2", + "subtle", + "time", + "tracing", + "zeroize", +] + +[[package]] +name = "aws-smithy-async" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62220bc6e97f946ddd51b5f1361f78996e704677afc518a4ff66b7a72ea1378c" +dependencies = [ + "futures-util", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "aws-smithy-checksums" +version = "0.60.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "598b1689d001c4d4dc3cb386adb07d37786783aee3ac4b324bcadac116bf3d23" +dependencies = [ + "aws-smithy-http", + "aws-smithy-types", + "bytes", + "crc32c", + "crc32fast", + "hex", + "http 0.2.12", + "http-body 0.4.6", + "md-5", + "pin-project-lite", + "sha1", + "sha2", + "tracing", +] + +[[package]] +name = "aws-smithy-eventstream" +version = "0.60.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cef7d0a272725f87e51ba2bf89f8c21e4df61b9e49ae1ac367a6d69916ef7c90" +dependencies = [ + "aws-smithy-types", + "bytes", + "crc32fast", +] + +[[package]] +name = "aws-smithy-http" +version = "0.60.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c8bc3e8fdc6b8d07d976e301c02fe553f72a39b7a9fea820e023268467d7ab6" +dependencies = [ + "aws-smithy-eventstream", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "bytes-utils", + "futures-core", + "http 0.2.12", + "http-body 0.4.6", + "once_cell", + "percent-encoding", + "pin-project-lite", + "pin-utils", + "tracing", +] + +[[package]] +name = "aws-smithy-json" +version = "0.60.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6" +dependencies = [ + "aws-smithy-types", +] + +[[package]] +name = "aws-smithy-query" +version = "0.60.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2fbd61ceb3fe8a1cb7352e42689cec5335833cd9f94103a61e98f9bb61c64bb" +dependencies = [ + "aws-smithy-types", + "urlencoding", +] + +[[package]] +name = "aws-smithy-runtime" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1ce695746394772e7000b39fe073095db6d45a862d0767dd5ad0ac0d7f8eb87" +dependencies = [ + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "fastrand", + "h2", + "http 0.2.12", + "http-body 0.4.6", + "http-body 1.0.1", + "httparse", + "hyper", + "hyper-rustls", + "once_cell", + "pin-project-lite", + "pin-utils", + "rustls", + "tokio", + "tracing", +] + +[[package]] +name = "aws-smithy-runtime-api" +version = "1.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e086682a53d3aa241192aa110fa8dfce98f2f5ac2ead0de84d41582c7e8fdb96" +dependencies = [ + "aws-smithy-async", + "aws-smithy-types", + "bytes", + "http 0.2.12", + "http 1.1.0", + "pin-project-lite", + "tokio", + "tracing", + "zeroize", +] + +[[package]] +name = "aws-smithy-types" +version = "1.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147100a7bea70fa20ef224a6bad700358305f5dc0f84649c53769761395b355b" +dependencies = [ + "base64-simd", + "bytes", + "bytes-utils", + "futures-core", + "http 0.2.12", + "http 1.1.0", + "http-body 0.4.6", + "http-body 1.0.1", + "http-body-util", + "itoa", + "num-integer", + 
"pin-project-lite", + "pin-utils", + "ryu", + "serde", + "time", + "tokio", + "tokio-util", +] + +[[package]] +name = "aws-smithy-xml" +version = "0.60.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab0b0166827aa700d3dc519f72f8b3a91c35d0b8d042dc5d643a91e6f80648fc" +dependencies = [ + "xmlparser", +] + +[[package]] +name = "aws-types" +version = "1.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5221b91b3e441e6675310829fd8984801b772cb1546ef6c0e54dec9f1ac13fef" +dependencies = [ + "aws-credential-types", + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "rustc_version", + "tracing", +] + [[package]] name = "backtrace" version = "0.3.73" @@ -116,18 +491,46 @@ dependencies = [ "rustc-demangle", ] +[[package]] +name = "base16ct" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "349a06037c7bf932dd7e7d1f653678b2038b9ad46a74102f1fc7bd7872678cce" + [[package]] name = "base32" version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "022dfe9eb35f19ebbcb51e0b40a5ab759f46ad60cadf7297e0bd085afb50e076" +[[package]] +name = "base64" +version = "0.21.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567" + [[package]] name = "base64" version = "0.22.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" +[[package]] +name = "base64-simd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "339abbe78e73178762e23bea9dfd08e697eb3f3301cd4be981c0f78ba5859195" +dependencies = [ + "outref", + "vsimd", +] + +[[package]] +name = "base64ct" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c3c1a368f70d6cf7302d78f8f7093da241fb8e8807c05cc9e51a125895a6d5b" + [[package]] name = "bit-set" version = "0.5.3" @@ -179,6 +582,16 @@ dependencies = [ "serde", ] +[[package]] +name = "bytes-utils" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dafe3a8757b027e2be6e4e5601ed563c55989fcf1546e933c66c8eb3a058d35" +dependencies = [ + "bytes", + "either", +] + [[package]] name = "cc" version = "1.1.7" @@ -203,9 +616,15 @@ dependencies = [ "num-traits", "serde", "wasm-bindgen", - "windows-targets 0.52.6", + "windows-targets", ] +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + [[package]] name = "core-foundation" version = "0.9.4" @@ -222,6 +641,55 @@ version = "0.8.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06ea2b9bc92be3c2baa9334a323ebca2d6f074ff852cd1d7b11064035cd3868f" +[[package]] +name = "cpufeatures" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "608697df725056feaccfa42cffdaeeec3fccc4ffc38358ecd19b243e716a78e0" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32c" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47" +dependencies = [ + "rustc_version", +] + +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crypto-bigint" +version = "0.4.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef2b4b23cddf68b89b8f8069890e8c270d54e2d5fe1b143820234805e4cb17ef" +dependencies = [ + "generic-array", + "rand_core", + "subtle", + "zeroize", +] + +[[package]] +name = "crypto-bigint" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dc92fb57ca44df6db8059111ab3af99a63d5d0f8375d9972e319a379c6bab76" +dependencies = [ + "rand_core", + "subtle", +] + [[package]] name = "crypto-common" version = "0.1.6" @@ -267,6 +735,16 @@ dependencies = [ "syn", ] +[[package]] +name = "der" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1a467a65c5e759bce6e65eaf91cc29f466cdc57cb65777bd646872a8a1fd4de" +dependencies = [ + "const-oid", + "zeroize", +] + [[package]] name = "deranged" version = "0.3.11" @@ -291,6 +769,19 @@ checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" dependencies = [ "block-buffer", "crypto-common", + "subtle", +] + +[[package]] +name = "ecdsa" +version = "0.14.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413301934810f597c1d19ca71c8710e99a3f1ba28a0d2ebc01551a2daeea3c5c" +dependencies = [ + "der", + "elliptic-curve", + "rfc6979", + "signature", ] [[package]] @@ -299,6 +790,26 @@ version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" +[[package]] +name = "elliptic-curve" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7bb888ab5300a19b8e5bceef25ac745ad065f3c9f7efc6de1b91958110891d3" +dependencies = [ + "base16ct", + "crypto-bigint 0.4.9", + "der", + "digest", + "ff", + "generic-array", + "group", + "pkcs8", + "rand_core", + "sec1", + "subtle", + "zeroize", +] + [[package]] name = "equivalent" version = "1.0.1" @@ -317,9 +828,19 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.1.0" +version = "2.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fc0510504f03c51ada170672ac806f1f105a88aa97a5281117e1ddc3368e51a" +checksum = "e8c02a5121d4ea3eb16a80748c74f5549a5665e4c21333c6098f283870fbdea6" + +[[package]] +name = "ff" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d013fc25338cc558c5c2cfbad646908fb23591e2404481826742b651c9af7160" +dependencies = [ + "rand_core", + "subtle", +] [[package]] name = "fnv" @@ -353,9 +874,9 @@ dependencies = [ [[package]] name = "futures-channel" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eac8f7d7865dcb88bd4373ab671c8cf4508703796caa2b1985a9ca867b3fcb78" +checksum = "2dff15bf788c671c1934e366d07e30c1814a8ef514e1af724a602e8a2fbe1b10" dependencies = [ "futures-core", "futures-sink", @@ -363,9 +884,9 @@ dependencies = [ [[package]] name = "futures-core" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dfc6580bb841c5a68e9ef15c77ccc837b40a7504914d52e47b8b0e9bbda25a1d" +checksum = "05f29059c0c2090612e8d742178b0580d2dc940c837851ad723096f87af6663e" [[package]] name = "futures-executor" @@ -380,15 +901,15 @@ dependencies = [ [[package]] name = "futures-io" -version = "0.3.30" 
+version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a44623e20b9681a318efdd71c299b6b222ed6f231972bfe2f224ebad6311f0c1" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" [[package]] name = "futures-macro" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" +checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", @@ -397,21 +918,21 @@ dependencies = [ [[package]] name = "futures-sink" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9fb8e00e87438d937621c1c6269e53f536c14d3fbd6a042bb24879e57d474fb5" +checksum = "e575fab7d1e0dcb8d0c7bcf9a63ee213816ab51902e6d244a95819acacf1d4f7" [[package]] name = "futures-task" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38d84fa142264698cdce1a9f9172cf383a0c82de1bddcf3092901442c4097004" +checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988" [[package]] name = "futures-util" -version = "0.3.30" +version = "0.3.31" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3d6401deb83407ab3da39eba7e33987a73c3df0c82b4bb5813ee871c19c41d48" +checksum = "9fa08315bb612088cc391249efdc3bc77536f16c91f6cf495e6fbe85b20a4a81" dependencies = [ "futures-channel", "futures-core", @@ -452,18 +973,29 @@ version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40ecd4077b5ae9fd2e9e169b102c6c330d0605168eb0e8bf79952b256dbefffd" +[[package]] +name = "group" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5dfbfb3a6cfbd390d5c9564ab283a0349b9b9fcd46a706c1eb10e0db70bfbac7" +dependencies = [ + "ff", + "rand_core", + "subtle", +] + [[package]] name = "h2" -version = "0.4.5" +version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa82e28a107a8cc405f0839610bdc9b15f1e25ec7d696aa5cf173edbcb1486ab" +checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" dependencies = [ - "atomic-waker", "bytes", "fnv", "futures-core", "futures-sink", - "http", + "futures-util", + "http 0.2.12", "indexmap 2.2.6", "slab", "tokio", @@ -482,6 +1014,10 @@ name = "hashbrown" version = "0.14.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" +dependencies = [ + "ahash", + "allocator-api2", +] [[package]] name = "heck" @@ -507,6 +1043,26 @@ version = "0.4.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = "http" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + [[package]] name = "http" version = "1.1.0" @@ -518,6 +1074,17 @@ dependencies = [ "itoa", ] +[[package]] +name = "http-body" +version = "0.4.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" +dependencies = [ + "bytes", + "http 0.2.12", + "pin-project-lite", +] + [[package]] name = "http-body" version = "1.0.1" @@ -525,7 +1092,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "http", + "http 1.1.0", ] [[package]] @@ -536,8 +1103,8 @@ checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" dependencies = [ "bytes", "futures-util", - "http", - "http-body", + "http 1.1.0", + "http-body 1.0.1", "pin-project-lite", ] @@ -547,6 +1114,12 @@ version = "1.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0fcc0b4a115bf80b728eb8ea024ad5bd707b615bfed49e0665b6e0f86fd082d9" +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + [[package]] name = "humantime" version = "2.1.0" @@ -555,60 +1128,42 @@ checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" [[package]] name = "hyper" -version = "1.4.1" +version = "0.14.30" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05" +checksum = "a152ddd61dfaec7273fe8419ab357f33aee0d914c5f4efbf0d96fa749eea5ec9" dependencies = [ "bytes", "futures-channel", + "futures-core", "futures-util", "h2", - "http", - "http-body", + "http 0.2.12", + "http-body 0.4.6", "httparse", + "httpdate", "itoa", "pin-project-lite", - "smallvec", + "socket2", "tokio", + "tower-service", + "tracing", "want", ] [[package]] name = "hyper-rustls" -version = "0.27.2" +version = "0.24.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ee4be2c948921a1a5320b629c4193916ed787a7f7f293fd3f7f5a6c9de74155" +checksum = "ec3efd23720e2049821a693cbc7e65ea87c72f1c58ff2f9522ff332b1491e590" dependencies = [ "futures-util", - "http", + "http 0.2.12", "hyper", - "hyper-util", + "log", "rustls", "rustls-native-certs", - "rustls-pki-types", "tokio", "tokio-rustls", - "tower-service", -] - -[[package]] -name = "hyper-util" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cde7055719c54e36e95e8719f95883f22072a48ede39db7fc17a4e1d5281e9b9" -dependencies = [ - "bytes", - "futures-channel", - "futures-util", - "http", - "http-body", - "hyper", - "pin-project-lite", - "socket2", - "tokio", - "tower", - "tower-service", - "tracing", ] [[package]] @@ -636,13 +1191,16 @@ dependencies = [ [[package]] name = "icechunk" -version = "0.1.0" +version = "0.1.0-alpha.3" dependencies = [ "async-recursion", "async-stream", "async-trait", + "aws-config", + "aws-credential-types", + "aws-sdk-s3", "base32", - "base64", + "base64 0.22.1", "bytes", "chrono", "futures", @@ -654,6 +1212,7 @@ dependencies = [ "quick_cache", "rand", "rmp-serde", + "rmpv", "serde", "serde_json", "serde_with", @@ -661,12 +1220,13 @@ dependencies = [ "test-strategy", "thiserror", "tokio", + "typed-path", "url", ] [[package]] name = "icechunk-python" -version = "0.1.0" +version = "0.1.0-alpha.3" dependencies = [ "async-stream", "bytes", @@ -675,6 +1235,7 @@ dependencies = [ "icechunk", "pyo3", "pyo3-asyncio-0-21", + "serde_json", "thiserror", "tokio", ] @@ -723,12 +1284,6 @@ version = "2.0.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "b248f5224d1d606005e02c97f5aa4e88eeb230488bcc03bc9ca4d7991399f2b5" -[[package]] -name = "ipnet" -version = "2.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" - [[package]] name = "itertools" version = "0.13.0" @@ -761,9 +1316,9 @@ checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" [[package]] name = "libc" -version = "0.2.155" +version = "0.2.159" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" +checksum = "561d97a539a36e26a9a5fad1ea11a3039a67714694aaa379433e580854bc3dc5" [[package]] name = "libm" @@ -793,6 +1348,15 @@ version = "0.4.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7a70ba024b9dc04c27ea2f0c0548feb474ec5c54bba33a7f72f873a39d07b24" +[[package]] +name = "lru" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37ee39891760e7d94734f6f63fedc29a2e4a152f836120753a72503f09fcf904" +dependencies = [ + "hashbrown 0.14.5", +] + [[package]] name = "md-5" version = "0.10.6" @@ -818,12 +1382,6 @@ dependencies = [ "autocfg", ] -[[package]] -name = "mime" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" - [[package]] name = "miniz_oxide" version = "0.7.4" @@ -851,6 +1409,15 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.19" @@ -877,22 +1444,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25a0c4b3a0e31f8b66f71ad8064521efa773910196e2cde791436f13409f3b45" dependencies = [ "async-trait", - "base64", "bytes", "chrono", "futures", "humantime", - "hyper", "itertools", - "md-5", "parking_lot", "percent-encoding", - "quick-xml", - "rand", - "reqwest", - "ring", - "serde", - "serde_json", "snafu", "tokio", "tracing", @@ -912,6 +1470,23 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +[[package]] +name = "outref" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4030760ffd992bef45b0ae3f10ce1aba99e33464c90d14dd7c039884963ddc7a" + +[[package]] +name = "p256" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51f44edd08f51e2ade572f141051021c5af22677e42b7dd28a88155151c33594" +dependencies = [ + "ecdsa", + "elliptic-curve", + "sha2", +] + [[package]] name = "parking_lot" version = "0.12.3" @@ -932,7 +1507,7 @@ dependencies = [ "libc", "redox_syscall", "smallvec", - "windows-targets 0.52.6", + "windows-targets", ] [[package]] @@ -947,26 +1522,6 @@ version = "2.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" -[[package]] -name = "pin-project" -version = "1.1.5" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bf43b791c5b9e34c3d182969b4abb522f9343702850a2e57f460d00d09b4b3" -dependencies = [ - "pin-project-internal", -] - -[[package]] -name = "pin-project-internal" -version = "1.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" -dependencies = [ - "proc-macro2", - "quote", - "syn", -] - [[package]] name = "pin-project-lite" version = "0.2.14" @@ -979,6 +1534,16 @@ version = "0.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" +[[package]] +name = "pkcs8" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9eca2c590a5f85da82668fa685c09ce2888b9430e83299debf1f34b65fd4a4ba" +dependencies = [ + "der", + "spki", +] + [[package]] name = "portable-atomic" version = "1.7.0" @@ -1131,16 +1696,6 @@ version = "1.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1d01941d82fa2ab50be1e79e6714289dd7cde78eba4c074bc5a4374f650dfe0" -[[package]] -name = "quick-xml" -version = "0.36.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96a05e2e8efddfa51a84ca47cec303fac86c8541b686d37cac5efc0e094417bc" -dependencies = [ - "memchr", - "serde", -] - [[package]] name = "quick_cache" version = "0.6.9" @@ -1153,54 +1708,6 @@ dependencies = [ "parking_lot", ] -[[package]] -name = "quinn" -version = "0.11.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b22d8e7369034b9a7132bc2008cac12f2013c8132b45e0554e6e20e2617f2156" -dependencies = [ - "bytes", - "pin-project-lite", - "quinn-proto", - "quinn-udp", - "rustc-hash", - "rustls", - "socket2", - "thiserror", - "tokio", - "tracing", -] - -[[package]] -name = "quinn-proto" -version = "0.11.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fadfaed2cd7f389d0161bb73eeb07b7b78f8691047a6f3e73caaeae55310a4a6" -dependencies = [ - "bytes", - "rand", - "ring", - "rustc-hash", - "rustls", - "slab", - "thiserror", - "tinyvec", - "tracing", -] - -[[package]] -name = "quinn-udp" -version = "0.5.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8bffec3605b73c6f1754535084a85229fa8a30f86014e6c81aeec4abb68b0285" -dependencies = [ - "libc", - "once_cell", - "socket2", - "tracing", - "windows-sys 0.52.0", -] - [[package]] name = "quote" version = "1.0.36" @@ -1258,6 +1765,12 @@ dependencies = [ "bitflags", ] +[[package]] +name = "regex-lite" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53a49587ad06b26609c52e423de037e7f57f20d53535d66e08c695f347df952a" + [[package]] name = "regex-syntax" version = "0.8.4" @@ -1265,48 +1778,14 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" [[package]] -name = "reqwest" -version = "0.12.5" +name = "rfc6979" +version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7d6d2a27d57148378eb5e111173f4276ad26340ecc5c49a4a2152167a2d6a37" +checksum = "7743f17af12fa0b03b803ba12cd6a8d9483a587e89c69445e3909655c0b9fabb" dependencies = [ - "base64", - "bytes", - "futures-core", - "futures-util", - "h2", - "http", - "http-body", - "http-body-util", - "hyper", - "hyper-rustls", - "hyper-util", - "ipnet", - "js-sys", - "log", - "mime", - 
"once_cell", - "percent-encoding", - "pin-project-lite", - "quinn", - "rustls", - "rustls-native-certs", - "rustls-pemfile", - "rustls-pki-types", - "serde", - "serde_json", - "serde_urlencoded", - "sync_wrapper", - "tokio", - "tokio-rustls", - "tokio-util", - "tower-service", - "url", - "wasm-bindgen", - "wasm-bindgen-futures", - "wasm-streams", - "web-sys", - "winreg", + "crypto-bigint 0.4.9", + "hmac", + "zeroize", ] [[package]] @@ -1346,6 +1825,18 @@ dependencies = [ "serde", ] +[[package]] +name = "rmpv" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58450723cd9ee93273ce44a20b6ec4efe17f8ed2e3631474387bfdecf18bb2a9" +dependencies = [ + "num-traits", + "rmp", + "serde", + "serde_bytes", +] + [[package]] name = "rustc-demangle" version = "0.1.24" @@ -1353,16 +1844,19 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] -name = "rustc-hash" -version = "2.0.0" +name = "rustc_version" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] [[package]] name = "rustix" -version = "0.38.34" +version = "0.38.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70dc5ec042f7a43c4a73241207cecc9873a06d45debb38b329f8541d85c2730f" +checksum = "8acb788b847c24f28525660c4d7758620a7210875711f79e7f663cc152726811" dependencies = [ "bitflags", "errno", @@ -1373,55 +1867,44 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.12" +version = "0.21.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c58f8c84392efc0a126acce10fa59ff7b3d2ac06ab451a33f2741989b806b044" +checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" dependencies = [ - "once_cell", + "log", "ring", - "rustls-pki-types", "rustls-webpki", - "subtle", - "zeroize", + "sct", ] [[package]] name = "rustls-native-certs" -version = "0.7.1" +version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a88d6d420651b496bdd98684116959239430022a115c1240e6c3993be0b15fba" +checksum = "a9aace74cb666635c918e9c12bc0d348266037aa8eb599b5cba565709a8dff00" dependencies = [ "openssl-probe", "rustls-pemfile", - "rustls-pki-types", "schannel", "security-framework", ] [[package]] name = "rustls-pemfile" -version = "2.1.3" +version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "196fe16b00e106300d3e45ecfcb764fa292a535d7326a29a5875c579c7417425" +checksum = "1c74cae0a4cf6ccbbf5f359f08efdf8ee7e1dc532573bf0db71968cb56b1448c" dependencies = [ - "base64", - "rustls-pki-types", + "base64 0.21.7", ] -[[package]] -name = "rustls-pki-types" -version = "1.8.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fc0a2ce646f8655401bb81e7927b812614bd5d91dbc968696be50603510fcaf0" - [[package]] name = "rustls-webpki" -version = "0.102.6" +version = "0.101.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e6b52d4fda176fd835fdc55a835d4a89b8499cad995885a21149d5ad62f852e" +checksum = "8b6275d1ee7a1cd780b64aca7726599a1dbc893b1e64144529e55c3c2f745765" dependencies = [ "ring", - "rustls-pki-types", "untrusted", ] @@ -1467,6 +1950,30 @@ version = "1.2.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "sct" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da046153aa2352493d6cb7da4b6e5c0c057d8a1d0a9aa8560baffdd945acd414" +dependencies = [ + "ring", + "untrusted", +] + +[[package]] +name = "sec1" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be24c1842290c45df0a7bf069e0c268a747ad05a192f2fd7dcfdbc1cba40928" +dependencies = [ + "base16ct", + "der", + "generic-array", + "pkcs8", + "subtle", + "zeroize", +] + [[package]] name = "security-framework" version = "2.11.1" @@ -1490,6 +1997,12 @@ dependencies = [ "libc", ] +[[package]] +name = "semver" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" + [[package]] name = "serde" version = "1.0.210" @@ -1499,6 +2012,15 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "serde_bytes" +version = "0.11.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "387cc504cb06bb40a96c8e04e951fe01854cf6bc921053c954e4a606d9675c6a" +dependencies = [ + "serde", +] + [[package]] name = "serde_derive" version = "1.0.210" @@ -1522,25 +2044,13 @@ dependencies = [ "serde", ] -[[package]] -name = "serde_urlencoded" -version = "0.7.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" -dependencies = [ - "form_urlencoded", - "itoa", - "ryu", - "serde", -] - [[package]] name = "serde_with" version = "3.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69cecfa94848272156ea67b2b1a53f20fc7bc638c4a46d2f8abde08f05f4b857" dependencies = [ - "base64", + "base64 0.22.1", "chrono", "hex", "indexmap 1.9.3", @@ -1564,6 +2074,47 @@ dependencies = [ "syn", ] +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "793db75ad2bcafc3ffa7c68b215fee268f537982cd901d132f89c6343f3a3dc8" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "signal-hook-registry" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9e9e0b4211b72e7b8b6e85c807d36c212bdb33ea8587f7569562a84df5465b1" +dependencies = [ + "libc", +] + +[[package]] +name = "signature" +version = "1.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74233d3b3b2f6d4b006dc19dee745e73e2a6bfb6f93607cd3b02bd5b00797d7c" +dependencies = [ + "digest", + "rand_core", +] + [[package]] name = "slab" version = "0.4.9" @@ -1616,6 +2167,16 @@ version = "0.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" +[[package]] +name = "spki" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67cf02bbac7a337dc36e4f5a693db6c21e7863f45070f7064577eb4367a3212b" +dependencies = [ + "base64ct", + "der", +] + [[package]] name = "strsim" version = "0.11.1" @@ -1662,12 
+2223,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "sync_wrapper" -version = "1.0.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" - [[package]] name = "target-lexicon" version = "0.12.16" @@ -1676,9 +2231,9 @@ checksum = "61c41af27dd6d1e27b1b16b489db798443478cef1f06a660c96db617ba5de3b1" [[package]] name = "tempfile" -version = "3.12.0" +version = "3.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04cbcdd0c794ebb0d4cf35e88edd2f7d2c4c3e9a5a6dab322839b321c6a87a64" +checksum = "f0f2c9fc62d0beef6951ccffd757e241266a2c833136efbe35af6cd2567dca5b" dependencies = [ "cfg-if", "fastrand", @@ -1776,6 +2331,7 @@ dependencies = [ "libc", "mio", "pin-project-lite", + "signal-hook-registry", "socket2", "tokio-macros", "windows-sys 0.52.0", @@ -1794,12 +2350,11 @@ dependencies = [ [[package]] name = "tokio-rustls" -version = "0.26.0" +version = "0.24.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" +checksum = "c28327cf380ac148141087fbfb9de9d7bd4e84ab5d2c28fbc911d753de8a7081" dependencies = [ "rustls", - "rustls-pki-types", "tokio", ] @@ -1816,27 +2371,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "tower" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" -dependencies = [ - "futures-core", - "futures-util", - "pin-project", - "pin-project-lite", - "tokio", - "tower-layer", - "tower-service", -] - -[[package]] -name = "tower-layer" -version = "0.3.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" - [[package]] name = "tower-service" version = "0.3.3" @@ -1880,6 +2414,12 @@ version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" +[[package]] +name = "typed-path" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50c0c7479c430935701ff2532e3091e6680ec03f2f89ffcd9988b08e885b90a5" + [[package]] name = "typenum" version = "1.17.0" @@ -1936,12 +2476,30 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + +[[package]] +name = "uuid" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" + [[package]] name = "version_check" version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "vsimd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" + [[package]] name = "wait-timeout" version = "0.2.0" @@ -2001,18 +2559,6 @@ dependencies = [ "wasm-bindgen-shared", ] -[[package]] -name = "wasm-bindgen-futures" -version = "0.4.42" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0" -dependencies = [ 
- "cfg-if", - "js-sys", - "wasm-bindgen", - "web-sys", -] - [[package]] name = "wasm-bindgen-macro" version = "0.2.92" @@ -2042,29 +2588,6 @@ version = "0.2.92" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" -[[package]] -name = "wasm-streams" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b65dc4c90b63b118468cf747d8bf3566c1913ef60be765b5730ead9e0a3ba129" -dependencies = [ - "futures-util", - "js-sys", - "wasm-bindgen", - "wasm-bindgen-futures", - "web-sys", -] - -[[package]] -name = "web-sys" -version = "0.3.69" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - [[package]] name = "winapi-util" version = "0.1.9" @@ -2080,16 +2603,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" dependencies = [ - "windows-targets 0.52.6", -] - -[[package]] -name = "windows-sys" -version = "0.48.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" -dependencies = [ - "windows-targets 0.48.5", + "windows-targets", ] [[package]] @@ -2098,7 +2612,7 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" dependencies = [ - "windows-targets 0.52.6", + "windows-targets", ] [[package]] @@ -2107,22 +2621,7 @@ version = "0.59.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" dependencies = [ - "windows-targets 0.52.6", -] - -[[package]] -name = "windows-targets" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" -dependencies = [ - "windows_aarch64_gnullvm 0.48.5", - "windows_aarch64_msvc 0.48.5", - "windows_i686_gnu 0.48.5", - "windows_i686_msvc 0.48.5", - "windows_x86_64_gnu 0.48.5", - "windows_x86_64_gnullvm 0.48.5", - "windows_x86_64_msvc 0.48.5", + "windows-targets", ] [[package]] @@ -2131,46 +2630,28 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" dependencies = [ - "windows_aarch64_gnullvm 0.52.6", - "windows_aarch64_msvc 0.52.6", - "windows_i686_gnu 0.52.6", + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", "windows_i686_gnullvm", - "windows_i686_msvc 0.52.6", - "windows_x86_64_gnu 0.52.6", - "windows_x86_64_gnullvm 0.52.6", - "windows_x86_64_msvc 0.52.6", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", ] -[[package]] -name = "windows_aarch64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" - [[package]] name = "windows_aarch64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" -[[package]] -name = "windows_aarch64_msvc" -version = "0.48.5" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" - [[package]] name = "windows_aarch64_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" -[[package]] -name = "windows_i686_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" - [[package]] name = "windows_i686_gnu" version = "0.52.6" @@ -2183,48 +2664,24 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" -[[package]] -name = "windows_i686_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" - [[package]] name = "windows_i686_msvc" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" -[[package]] -name = "windows_x86_64_gnu" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" - [[package]] name = "windows_x86_64_gnu" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" -[[package]] -name = "windows_x86_64_gnullvm" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" - [[package]] name = "windows_x86_64_gnullvm" version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" -[[package]] -name = "windows_x86_64_msvc" -version = "0.48.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" - [[package]] name = "windows_x86_64_msvc" version = "0.52.6" @@ -2232,14 +2689,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] -name = "winreg" -version = "0.52.0" +name = "xmlparser" +version = "0.13.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" -dependencies = [ - "cfg-if", - "windows-sys 0.48.0", -] +checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" [[package]] name = "yansi" diff --git a/Cargo.toml b/Cargo.toml index 60af7b42..03e38d03 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -7,3 +7,9 @@ resolver = "2" expect_used = "warn" unwrap_used = "warn" panic = "warn" + +[workspace.metadata.release] +allow-branch = ["main"] +sign-commit = true +sign-tag = true +push = false diff --git a/Changelog.md b/Changelog.md new file mode 120000 index 00000000..c838787b --- /dev/null +++ b/Changelog.md @@ -0,0 +1 @@ +Changelog.python.md \ No newline at end of file diff --git a/Changelog.python.md b/Changelog.python.md new file mode 100644 index 00000000..89d7fe93 --- /dev/null +++ b/Changelog.python.md @@ -0,0 +1,9 @@ +# Changelog + + +## Python Icechunk Library 0.1.0a2 
+ +### Features + +- Initial release + diff --git a/Changelog.rust.md b/Changelog.rust.md new file mode 100644 index 00000000..3012aa90 --- /dev/null +++ b/Changelog.rust.md @@ -0,0 +1,17 @@ +# Changelog + +## Rust Icechunk Library 0.1.0-alpha.3 + +### Features + +- Added new `Store::list_dir_items` method and `ListDirItem` type to distinguish keys and + prefixes during `list_dir` operations. +- New `ByteRange` type allows retrieving the final `n` bytes of a chunk. + + +## Rust Icechunk Library 0.1.0-alpha.2 + +### Features + +- Initial release + diff --git a/Justfile b/Justfile index 0fdfb0fc..c47e4640 100644 --- a/Justfile +++ b/Justfile @@ -3,7 +3,7 @@ alias pre := pre-commit # run all tests test *args='': - AWS_ALLOW_HTTP=1 AWS_ENDPOINT_URL=http://localhost:9000 AWS_ACCESS_KEY_ID=minio123 AWS_SECRET_ACCESS_KEY=minio123 cargo test {{args}} + cargo test --all {{args}} # compile but don't run all tests compile-tests *args='': @@ -37,4 +37,11 @@ run-all-examples: for example in icechunk/examples/*.rs; do cargo run --example "$(basename "${example%.rs}")"; done # run all checks that CI actions will run -pre-commit $RUSTFLAGS="-D warnings -W unreachable-pub -W bare-trait-objects": (compile-tests "--locked") build (format "--check") lint test run-all-examples check-deps +pre-commit $RUSTFLAGS="-D warnings -W unreachable-pub -W bare-trait-objects": + just compile-tests "--locked" + just build + just format "--check" + just lint + just test + just run-all-examples + just check-deps diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..bf4a506c --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2024 Earthmover PBC + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index 48260b05..3c284c86 100644 --- a/README.md +++ b/README.md @@ -1,153 +1,92 @@ # Icechunk -![Icechunk logo](icechunk_logo.png) +![Icechunk logo](https://raw.githubusercontent.com/earth-mover/icechunk/refs/heads/main/docs/docs/assets/logo.svg) -Icechunk is a transactional storage engine for Zarr designed for use on cloud object storage. +PyPI +Crates.io +GitHub Repo stars +Earthmover Community Slack -Let's break down what that means: +--- + +Icechunk is an open-source (Apache 2.0), transactional storage engine for tensor / ND-array data designed for use on cloud object storage. +Icechunk works together with **[Zarr](https://zarr.dev/)**, augmenting the Zarr core data model with features +that enhance performance, collaboration, and safety in a cloud-computing context. + +## Documentation and Resources + +- This page: a general overview of the project's goals and components. 
+- [Icechunk Launch Blog Post](https://earthmover.io/blog/icechunk) +- [Frequently Asked Questions](https://icechunk.io/faq) +- Documentation for [Icechunk Python](https://icechunk.io/icechunk-python), the main user-facing + library +- Documentation for the [Icechunk Rust Crate](https://icechunk.io/icechunk-rust) +- The [Contributor Guide](https://icechunk.io/contributing) +- The [Icechunk Spec](https://icechunk.io/spec) + +## Icechunk Overview + +Let's break down what "transactional storage engine for Zarr" actually means: - **[Zarr](https://zarr.dev/)** is an open source specification for the storage of multidimensional array (a.k.a. tensor) data. Zarr defines the metadata for describing arrays (shape, dtype, etc.) and the way these arrays are chunked, compressed, and converted to raw bytes for storage. Zarr can store its data in any key-value store. + There are many different implementations of Zarr in different languages. _Right now, Icechunk only supports + [Zarr Python](https://zarr.readthedocs.io/en/stable/)._ + If you're interested in implementing Icechunk support, please [open an issue](https://github.com/earth-mover/icechunk/issues) so we can help you. - **Storage engine** - Icechunk exposes a key-value interface to Zarr and manages all of the actual I/O for getting, setting, and updating both metadata and chunk data in cloud object storage. Zarr libraries don't have to know exactly how icechunk works under the hood in order to use it. - **Transactional** - The key improvement that Icechunk brings on top of regular Zarr is to provide consistent serializable isolation between transactions. This means that Icechunk data are safe to read and write in parallel from multiple uncoordinated processes. This allows Zarr to be used more like a database. -## Goals of Icechunk - -The core entity in Icechunk is a **store**. -A store is defined as a Zarr hierarchy containing one or more Arrays and Groups. -The most common scenario is for an Icechunk store to contain a single Zarr group with multiple arrays, each corresponding to different physical variables but sharing common spatiotemporal coordinates. -However, formally a store can be any valid Zarr hierarchy, from a single Array to a deeply nested structure of Groups and Arrays. -Users of Icechunk should aim to scope their stores only to related arrays and groups that require consistent transactional updates. - -Icechunk aspires to support the following core requirements for stores: - -1. **Object storage** - the format is designed around the consistency features and performance characteristics available in modern cloud object storage. No external database or catalog is required to maintain a store. -1. **Serializable isolation** - Reads are isolated from concurrent writes and always use a committed snapshot of a store. Writes are committed atomically and are never partially visible. Readers will not acquire locks. -1. **Time travel** - Previous snapshots of a store remain accessible after new ones have been written. -1. **Data Version Control** - Stores support both _tags_ (immutable references to snapshots) and _branches_ (mutable references to snapshots). -1. **Chunk sharding and references** - Chunk storage is decoupled from specific file names. Multiple chunks can be packed into a single object (sharding). Zarr-compatible chunks within other file formats (e.g. HDF5, NetCDF) can be referenced. -1. **Schema Evolution** - Arrays and Groups can be added, renamed, and removed from the hierarchy with minimal overhead.
- -## The Project - -This Icechunk project consists of three main parts: - -1. The [Icechunk specification](spec/icechunk_spec.md). -1. A Rust implementation -1. A Python wrapper which exposes a Zarr store interface - -All of this is open source, licensed under the Apache 2.0 license. - -The Rust implementation is a solid foundation for creating bindings in any language. -We encourage adopters to collaborate on the Rust implementation, rather than reimplementing Icechunk from scratch in other languages. - -We encourage collaborators from the broader community to contribute to Icechunk. -Governance of the project will be managed by Earthmover PBC. - -## How Can I Use It? - -We recommend using Icechunk from Python, together with the Zarr-Python library - -> [!WARNING] -> Icechunk is a very new project. -> It is not recommended for production use at this time. -> These instructions are aimed at Icechunk developers and curious early adopters. - -### Installation and Dependencies - -Icechunk is currently designed to support the [Zarr V3 Specification](https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html). -Using it today requires installing the [still unreleased] Zarr Python V3 branch. - -To set up an Icechunk development environment, follow these steps - -Activate your preferred virtual environment (here we use `virtualenv`): - -```bash -python3 -m venv .venv -source .venv/bin/activate -``` - -Alternatively, create a conda environment - -```bash -mamba create -n icechunk rust python=3.12 -conda activate icechunk -``` - -Install `maturin`: - -```bash -pip install maturin -``` - -Build the project in dev mode: - -```bash -maturin develop -``` - -or build the project in editable mode: +The core entity in Icechunk is a repository or **repo**. +A repo is defined as a Zarr hierarchy containing one or more Arrays and Groups, and a repo functions as +a self-contained _Zarr Store_. +The most common scenario is for an Icechunk repo to contain a single Zarr group with multiple arrays, each corresponding to different physical variables but sharing common spatiotemporal coordinates. +However, formally a repo can be any valid Zarr hierarchy, from a single Array to a deeply nested structure of Groups and Arrays. +Users of Icechunk should aim to scope their repos only to related arrays and groups that require consistent transactional updates. -```bash -pip install -e icechunk@. -``` +Icechunk supports the following core requirements: -> [!WARNING] -> This only makes the python source code editable, the rust will need to -> be recompiled when it changes +1. **Object storage** - the format is designed around the consistency features and performance characteristics available in modern cloud object storage. No external database or catalog is required to maintain a repo. +(It also works with file storage.) +1. **Serializable isolation** - Reads are isolated from concurrent writes and always use a committed snapshot of a repo. Writes are committed atomically and are never partially visible. No locks are required for reading. +1. **Time travel** - Previous snapshots of a repo remain accessible after new ones have been written. +1. **Data version control** - Repos support both _tags_ (immutable references to snapshots) and _branches_ (mutable references to snapshots). +1. **Chunk shardings** - Chunk storage is decoupled from specific file names. Multiple chunks can be packed into a single object (sharding). +1. **Chunk references** - Zarr-compatible chunks within other file formats (e.g.
HDF5, NetCDF) can be referenced. +1. **Schema evolution** - Arrays and Groups can be added, renamed, and removed from the hierarchy with minimal overhead. -### Basic Usage +## Key Concepts -Once you have everything installed, here's an example of how to use Icechunk. +### Groups, Arrays, and Chunks -```python -from icechunk import IcechunkStore, Storage -from zarr import Array, Group +Icechunk is designed around the Zarr data model, widely used in scientific computing, data science, and AI / ML. +(The Zarr high-level data model is effectively the same as HDF5.) +The core data structure in this data model is the **array**. +Arrays have two fundamental properties: +- **shape** - a tuple of integers which specify the dimensions of each axis of the array. A 10 x 10 square array would have shape (10, 10) +- **data type** - a specification of what type of data is found in each element, e.g. integer, float, etc. Different data types have different precision (e.g. 16-bit integer, 64-bit float, etc.) -# Example using memory store -storage = Storage.memory("test") -store = await IcechunkStore.open(storage=storage, mode='r+') +In Zarr / Icechunk, arrays are split into **chunks**. +A chunk is the minimum unit of data that must be read / written from storage, and thus choices about chunking have strong implications for performance. +Zarr leaves this completely up to the user. +Chunk shape should be chosen based on the anticipated data access pattern for each array. +An Icechunk array is not bounded by an individual file and is effectively unlimited in size. -# Example using file store -storage = Storage.filesystem("/path/to/root") -store = await IcechunkStore.open(storage=storage, mode='r+') +For further organization of data, Icechunk supports **groups** within a single repo. +Groups are like folders which contain multiple arrays and/or other groups. +Groups enable data to be organized into hierarchical trees. +A common usage pattern is to store multiple arrays in a group representing a NetCDF-style dataset. -# Example using S3 -s3_storage = Storage.s3_from_env(bucket="icechunk-test", prefix="oscar-demo-repository") -store = await IcechunkStore.open(storage=storage, mode='r+') -``` +Arbitrary JSON-style key-value metadata can be attached to both arrays and groups. -## Running Tests - -You will need [`docker compose`](https://docs.docker.com/compose/install/) and (optionally) [`just`](https://just.systems/). -Once those are installed, first switch to the icechunk root directory, then start up a local minio server: -``` -docker compose up -d -``` - -Use `just` to conveniently run a test -``` -just test -``` - -This is just an alias for - -``` -AWS_ALLOW_HTTP=1 AWS_ENDPOINT_URL=http://localhost:9000 AWS_ACCESS_KEY_ID=minio123 AWS_SECRET_ACCESS_KEY=minio123 cargo test -``` - -> [!TIP] -> For other aliases see [Justfile](./Justfile). - -## Snapshots, Branches, and Tags +### Snapshots Every update to an Icechunk store creates a new **snapshot** with a unique ID. Icechunk users must organize their updates into groups of related operations called **transactions**. -For example, appending a new time slice to mutliple arrays should be done as single transactions, comprising the following steps +For example, appending a new time slice to multiple arrays should be done as a single transaction, comprising the following steps: 1. Update the array metadata to resize the array to accommodate the new elements. 2. Write new chunks for each array in the group.
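As a rough sketch of these two steps (using a zarr-python v2-style API purely for illustration; the Icechunk store/session setup and the commit that closes the transaction are omitted, and the array names are hypothetical):

```python
import numpy as np
import zarr  # zarr-python v2-style API, shown for illustration only

# In a real workflow this group would be opened through an Icechunk-backed store;
# an in-memory group stands in for it here.
group = zarr.group()
temp = group.create_dataset(
    "temperature", shape=(10, 180, 360), chunks=(1, 180, 360), dtype="f4"
)

# Step 1: update the array metadata to make room for the new time slice.
temp.resize(temp.shape[0] + 1, 180, 360)

# Step 2: write the new chunks for the appended slice.
temp[-1, :, :] = np.random.rand(180, 360).astype("f4")
```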
@@ -155,22 +94,30 @@ While the transaction is in progress, none of these changes will be visible to o Once the transaction is committed, a new snapshot is generated. Readers can only see and use committed snapshots. -Additionally, snapshots occur in a specific linear (i.e. serializable) order within **branch**. +### Branches and Tags + +Additionally, snapshots occur in a specific linear (i.e. serializable) order within a **branch**. A branch is a mutable reference to a snapshot--a pointer that maps the branch name to a snapshot ID. The default branch is `main`. Every commit to the main branch updates this reference. Icechunk's design protects against the race condition in which two uncoordinated sessions attempt to update the branch at the same time; only one can succeed. -Finally, Icechunk defines **tags**--_immutable_ references to snapshot. +Icechunk also defines **tags**--_immutable_ references to snapshots. Tags are appropriate for publishing specific releases of a repository or for any application which requires a persistent, immutable identifier to the store state. +### Chunk References + +Chunk references are "pointers" to chunks that exist in other files--HDF5, NetCDF, GRIB, etc. +Icechunk can store these references alongside native Zarr chunks as "virtual datasets". +You can then update these virtual datasets incrementally (overwrite chunks, change metadata, etc.) without touching the underlying files. + ## How Does It Work? -> [!NOTE] -> For more detailed explanation, have a look at the [Icechunk spec](spec/icechunk_spec.md) +**!!! Note:** + For a more detailed explanation, have a look at the [Icechunk spec](./docs/docs/spec.md). Zarr itself works by storing both metadata and chunk data into a abstract store according to a specified system of "keys". -For example, a 2D Zarr array called myarray, within a group called mygroup, would generate the following keys. +For example, a 2D Zarr array called `myarray`, within a group called `mygroup`, would generate the following keys: ``` mygroup/zarr.json @@ -180,10 +127,11 @@ mygroup/myarray/c/0/1 ``` In standard regular Zarr stores, these key map directly to filenames in a filesystem or object keys in an object storage system. -When writing data, a Zarr implementation will create these keys and populate them with data. When modifying existing arrays or groups, a Zarr implementation will potentially overwrite existing keys with new data. +When writing data, a Zarr implementation will create these keys and populate them with data. +When modifying existing arrays or groups, a Zarr implementation will potentially overwrite existing keys with new data. This is generally not a problem, as long there is only one person or process coordinating access to the data. -However, when multiple uncoordinated readers and writers attempt to access the same Zarr data at the same time, [various consistency problems](https://docs.earthmover.io/concepts/version-control-system#consistency-problems-with-zarr) problems emerge. +However, when multiple uncoordinated readers and writers attempt to access the same Zarr data at the same time, [various consistency problems](https://docs.earthmover.io/concepts/version-control-system#consistency-problems-with-zarr) emerge. These consistency problems can occur in both file storage and object storage; they are particularly severe in a cloud setting where Zarr is being used as an active store for data that are frequently changed while also being read.
With Icechunk, we keep the same core Zarr data model, but add a layer of indirection between the Zarr keys and the on-disk storage. @@ -197,29 +145,3 @@ flowchart TD zarr-python[Zarr Library] <-- key / value--> icechunk[Icechunk Library] icechunk <-- data / metadata files --> storage[(Object Storage)] ``` - -## FAQ - -1. _Why not just use Iceberg directly?_ - - Iceberg and all other "table formats" (Delta, Hudi, LanceDB, etc.) are based on tabular data model. - This data model cannot accommodate large, multidimensional arrays (tensors) in a general, scalable way. - -1. Is Icechunk part of Zarr? - - Formally, no. - Icechunk is a separate specification from Zarr. - However, it is designed to interoperate closely with Zarr. - In the future, we may propose a more formal integration between the Zarr spec and Icechunk spec if helpful. - For now, keeping them separate allows us to evolve Icechunk quickly while maintaining the stability and backwards compatibility of the Zarr data model. - -## Inspiration - -Icechunk's was inspired by several existing projects and formats, most notably - -- [FSSpec Reference Filesystem](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.implementations.reference.ReferenceFileSystem) -- [Apache Iceberg](https://iceberg.apache.org/spec/) -- [LanceDB](https://lancedb.github.io/lance/format.html) -- [TileDB](https://docs.tiledb.com/main/background/key-concepts-and-data-format) -- [OCDBT](https://google.github.io/tensorstore/kvstore/ocdbt/index.html) - diff --git a/compose.yaml b/compose.yaml index 6034aac4..5f3ee303 100644 --- a/compose.yaml +++ b/compose.yaml @@ -4,7 +4,7 @@ volumes: services: minio: container_name: icechunk_minio - image: quay.io/minio/minio + image: minio/minio entrypoint: | /bin/sh -c ' for bucket in testbucket externalbucket arraylake-repo-bucket diff --git a/docs/.env.sample b/docs/.env.sample new file mode 100644 index 00000000..4b87c8e8 --- /dev/null +++ b/docs/.env.sample @@ -0,0 +1 @@ +MKDOCS_GIT_COMMITTERS_APIKEY= \ No newline at end of file diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 00000000..9fe706f9 --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,78 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# imported files +docs/icechunk-python/examples +docs/icechunk-python/notebooks + +# C extensions +*.so + +# Build +.site + +.env +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +node_modules/ +parts/ +sdist/ +var/ +package*.json +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover +.hypothesis/ + +# Translations +*.mo + +# Scrapy stuff: +.scrapy + +# PyBuilder +target/ + +# IPython Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# virtualenv +venv/ +ENV/ + +# MkDocs documentation +site*/ \ No newline at end of file diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..f08ba394 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,50 @@ +# Icechunk Documentation Website + +Built with [MkDocs](https://www.mkdocs.org/) using [Material for MkDocs](https://squidfunk.github.io/mkdocs-material/). 
+ +## Developing + +### Prerequisites + +This repository uses [Poetry](https://python-poetry.org/) to manage dependencies + +1. Install dependencies using `poetry install` + +### Running + +1. Run `poetry shell` from the `/docs` directory +2. Start the MkDocs development server: `mkdocs serve` + +Alternatively you can run `poetry run mkdocs serve` + +!!! tip + You can use the optional `--dirty` flag to only rebuild changed files, although you may need to restart if you make changes to `mkdocs.yaml`. + +### Building + +1. Run `mkdocs build` + +Builds output to: `icechunk-docs/.site` directory. + + +### Deploying + +Docs are automatically deployed upon commits to `main` branch via the `./github/workflows/deploy-docs.yaml` action. + +You can manually deploy by running the command `mkdocs gh-deploy --force` from the directory containing the `mkdocs.yml` file. + +## Dev Notes + +#### Symlinked Files + +Several directories and files are symlinked into the MkDocs' `/docs`[^1] directory in order to be made available to MkDocs. Avoid modifying them directly: + * `/docs/icechunk-python/examples/` + * `/docs/icechunk-python/notebooks/` + * `/docs/spec.md` + +These are also ignored in `.gitignore` + +!!! tip + See [icechunk-docs/macros.py](./macros.py) for more info. + +[^1]: Disambiguation: `icechunk/docs/docs` \ No newline at end of file diff --git a/docs/docs/CNAME b/docs/docs/CNAME new file mode 100644 index 00000000..a58d1b90 --- /dev/null +++ b/docs/docs/CNAME @@ -0,0 +1 @@ +icechunk.io \ No newline at end of file diff --git a/docs/docs/assets/datasets/oisst.png b/docs/docs/assets/datasets/oisst.png new file mode 100644 index 00000000..b81c9faa Binary files /dev/null and b/docs/docs/assets/datasets/oisst.png differ diff --git a/docs/docs/assets/earthmover/lockup.svg b/docs/docs/assets/earthmover/lockup.svg new file mode 100755 index 00000000..73ff51ae --- /dev/null +++ b/docs/docs/assets/earthmover/lockup.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/docs/assets/earthmover/workmark.svg b/docs/docs/assets/earthmover/workmark.svg new file mode 100755 index 00000000..11e8c955 --- /dev/null +++ b/docs/docs/assets/earthmover/workmark.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/docs/assets/favicon/favicon-96x96.png b/docs/docs/assets/favicon/favicon-96x96.png new file mode 100644 index 00000000..fee83510 Binary files /dev/null and b/docs/docs/assets/favicon/favicon-96x96.png differ diff --git a/docs/docs/assets/hero/heart.svg b/docs/docs/assets/hero/heart.svg new file mode 100644 index 00000000..a5062563 --- /dev/null +++ b/docs/docs/assets/hero/heart.svg @@ -0,0 +1,77 @@ + + + + + + + + + + + + + + diff --git a/docs/docs/assets/hero/hero-bottom-dark.svg b/docs/docs/assets/hero/hero-bottom-dark.svg new file mode 100644 index 00000000..862f7fde --- /dev/null +++ b/docs/docs/assets/hero/hero-bottom-dark.svg @@ -0,0 +1,98 @@ + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/docs/assets/hero/hero-bottom.svg b/docs/docs/assets/hero/hero-bottom.svg new file mode 100644 index 00000000..1385c033 --- /dev/null +++ b/docs/docs/assets/hero/hero-bottom.svg @@ -0,0 +1,98 @@ + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/docs/assets/hero/ice-1-dark.svg b/docs/docs/assets/hero/ice-1-dark.svg new file mode 100644 index 00000000..4466f249 --- /dev/null +++ b/docs/docs/assets/hero/ice-1-dark.svg @@ -0,0 +1,79 @@ + + + + + + + + + + + + + + + + + + + diff --git a/docs/docs/assets/hero/ice-1.svg b/docs/docs/assets/hero/ice-1.svg new file mode 100644 index 
00000000..c35b6e2b --- /dev/null +++ b/docs/docs/assets/hero/ice-1.svg @@ -0,0 +1,76 @@ + + + + + + + + + + + + + + + + + + diff --git a/docs/docs/assets/logo-wire.svg b/docs/docs/assets/logo-wire.svg new file mode 100644 index 00000000..fed114fd --- /dev/null +++ b/docs/docs/assets/logo-wire.svg @@ -0,0 +1,17 @@ + + + + + + diff --git a/docs/docs/assets/logo.svg b/docs/docs/assets/logo.svg new file mode 100644 index 00000000..4714c06a --- /dev/null +++ b/docs/docs/assets/logo.svg @@ -0,0 +1,1110 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/docs/docs/contributing.md b/docs/docs/contributing.md new file mode 100644 index 00000000..9a6efe55 --- /dev/null +++ b/docs/docs/contributing.md @@ -0,0 +1,103 @@ +--- +title: Contributing +--- +# Contributing + +👋 Hi! Thanks for your interest in contributing to Icechunk! + +Icechunk is an open source (Apache 2.0) project and welcomes contributions in the form of: + +- Usage questions - [open a GitHub issue](https://github.com/earth-mover/icechunk/issues) +- Bug reports - [open a GitHub issue](https://github.com/earth-mover/icechunk/issues) +- Feature requests - [open a GitHub issue](https://github.com/earth-mover/icechunk/issues) +- Documentation improvements - [open a GitHub pull request](https://github.com/earth-mover/icechunk/pulls) +- Bug fixes and enhancements - [open a GitHub pull request](https://github.com/earth-mover/icechunk/pulls) + + +## Development + +### Python Development Workflow + +Create / activate a virtual environment: + +=== "Venv" + + ```bash + python3 -m venv .venv + source .venv/bin/activate + ``` + +=== "Conda / Mamba" + + ```bash + mamba create -n icechunk python=3.12 rust zarr + mamba activate icechunk + ``` + +Install `maturin`: + +```bash +pip install maturin +``` + +Build the project in dev mode: + +```bash +maturin develop +``` + +or build the project in editable mode: + +```bash +pip install -e icechunk@. +``` + + +### Rust Development Workflow + +TODO + +## Roadmap + +The initial release of Icechunk is just the beginning. We have a lot more planned for the format and the API. + +### Core format + +The core format is where we’ve put most of our effort to date and we plan to continue work in this area. Leading up to the 1.0 release, we will be focused on stabilizing data structures for snapshots, chunk manifests, attribute files and references. We’ll also document and add more mechanisms for on-disk format evolution. The intention is to guarantee that any new version of Icechunk can always read repositories generated with any previous versions. We expect to evolve the [spec](https://icechunk.io/spec/) and the Rust implementation as we stabilize things. + +### Optimizations + +While the initial performance benchmarks of Icechunk are very encouraging, we know that we have only scratched the surface of what is possible. We are looking forward to investing in a number of optimizations that will really make Icechunk fly! 
+ +- Chunk compaction on write +- Request batching and splitting +- Manifest compression and serialization improvements +- Manifest split heuristics +- Bringing parts of the codec pipeline to the Rust side +- Better caching, in memory and optionally on local disk +- Performance statistics, tests, baseline and evolution + +### Other Utilities + +On top of the foundation of the Icechunk format, we are looking to build a suite of other utilities that operate on data stored in Icechunk. Some examples: + +- Garbage collection - version controlled data has the potential to accumulate data that is no longer needed but is still included in the store. A garbage collection process will allow users to safely clean up data from old versions of an Icechunk dataset. +- Chunk compaction - data written by Zarr may result in many small chunks in object storage. A chunk compaction service will allow users to retroactively compact small chunks into larger objects (similar to Zarr’s sharding format), resulting in potential performance improvements and fewer objects in storage. +- Manifest optimization - knowing how the data is queried would allow optimizing the shape and splits of the chunk manifests, in such a way as to minimize the amount of data needed to execute the most frequent queries. + +### Zarr-related + +We’re very excited about a number of extensions to Zarr that would work great with Icechunk. + +- [Variable length chunks](https://zarr.dev/zeps/draft/ZEP0003.html) +- [Chunk-level statistics](https://zarr.dev/zeps/draft/ZEP0005.html) + +### Miscellaneous + +There’s much more than what we’ve written above on the roadmap. Some examples: + +- Distributed write support with `dask.array` +- Multi-language support (R, Julia, …) +- Exposing high level API (groups and arrays) to Python users +- Make more details of the format accessible through configuration +- Improve Xarray backend to integrate more directly with Icechunk diff --git a/docs/docs/faq.md b/docs/docs/faq.md new file mode 100644 index 00000000..4539555c --- /dev/null +++ b/docs/docs/faq.md @@ -0,0 +1,374 @@ +--- +title: Frequently Asked Questions +--- + +# FAQ + +## Why was Icechunk created? + +Icechunk was created by [Earthmover](https://earthmover.io/) as the open-source format for its cloud data platform [Arraylake](https://docs.earthmover.io). + +Icechunk builds on the successful [Zarr](https://zarr.dev) project. +Zarr is a great foundation for storing and querying large multidimensional array data in a flexible, scalable way. +But when people started using Zarr together with cloud object storage in a collaborative way, it became clear that Zarr alone could not offer the sort of consistency many users desired. +Icechunk makes Zarr work a little bit more like a database, enabling different users / processes to safely read and write concurrently, while still only using object storage as a persistence layer. + +Another motivation for Icechunk was the success of [Kerchunk](https://github.com/fsspec/kerchunk/). +The Kerchunk project showed that it was possible to map many existing archival formats (e.g. HDF5, NetCDF, GRIB) to the Zarr data model without actually rewriting any bytes, by creating "virtual" Zarr datasets referencing binary chunks inside other files. +Doing this at scale requires tracking millions of "chunk references." +Icechunk's storage model allows for these virtual chunks to be stored seamlessly alongside native Zarr chunks.
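Conceptually, each such chunk reference is just a small pointer into an existing file. The toy sketch below illustrates the idea only; it is not the on-disk encoding defined by the Icechunk spec, and the field names and paths are hypothetical:

```python
from dataclasses import dataclass

@dataclass(frozen=True)
class VirtualChunkRef:
    """Toy illustration of a virtual chunk reference; not the real Icechunk format."""
    location: str  # file containing the bytes, e.g. "s3://bucket/archive/file0001.nc"
    offset: int    # byte offset of the chunk within that file
    length: int    # number of bytes to read

# One Zarr chunk of a "virtual dataset", served directly from an archival NetCDF file.
ref = VirtualChunkRef("s3://bucket/archive/file0001.nc", offset=4096, length=131072)
```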
+ +Finally, Icechunk provides a universal I/O layer for cloud object storage, implementing numerous performance optimizations designed to accelerate data-intensive applications. + +Solving these problems in one go via a powerful, open-source, Rust-based library will bring massive benefits +to the cloud-native scientific data community. + +## Where does the name "Icechunk" come from? + +Icechunk was inspired partly by [Apache Iceberg](https://iceberg.apache.org/), a popular cloud-native table format. +However, instead of storing tabular data, Icechunk stores multidimensional arrays, for which the individual unit of +storage is the _chunk_. + +## When should I use Icechunk? + +Here are some scenarios where it makes sense to use Icechunk: + +- You want to store large, dynamically evolving multi-dimensional arrays (a.k.a. tensors) in cloud object storage. +- You want to allow multiple uncoordinated processes to access your data at the same time (like a database). +- You want to be able to safely roll back failed updates or revert Zarr data to an earlier state. +- You want to use concepts from data version control (e.g. tagging, branching, snapshots) with Zarr data. +- You want to achieve cloud-native performance on archival file formats (HDF5, NetCDF, GRIB) by exposing them as virtual Zarr datasets and need to store chunk references in a robust, scalable, interoperable way. +- You want to get the best possible performance for reading / writing tensor data in AI / ML workflows. + +## What are the downsides to using Icechunk? + +As with all things in technology, the benefits of Icechunk come with some tradeoffs: + +- There may be slightly higher cold-start latency to opening Icechunk datasets compared with regular Zarr. +- The on-disk format is less transparent than regular Zarr. +- The process for distributed writes is more complex to coordinate. + +!!! warning + Another downside of Icechunk in its current state is its immaturity. + The library is very new, likely contains bugs, and is not recommended + for production usage at this point in time. + + +## What is Icechunk's relationship to Zarr? + +The Zarr format and protocol are agnostic to the underlying storage system ("store" in Zarr terminology) +and communicate with the store via a simple key / value interface. +Zarr tells the store which keys and values it wants to get or set, and it's the store's job +to figure out how to persist or retrieve the required bytes. + +Most existing Zarr stores have a simple 1:1 mapping between Zarr's keys and the underlying file / object names. +For example, if Zarr asks for a key called `myarray/c/0/0`, the store may just look up a key of the same name +in an underlying cloud object storage bucket. + +Icechunk is a storage engine which creates a layer of indirection between the +Zarr keys and the actual files in storage. +A Zarr library doesn't have to know explicitly how Icechunk works or how it's storing data on disk. +It just gets / sets keys as it would with any store. +Icechunk figures out how to materialize these keys based on its [storage schema](./spec.md). +
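To make that 1:1 mapping concrete, here is a deliberately naive store sketch (plain Python, illustrative only; Icechunk's real store instead translates these keys through its snapshot and manifest structures rather than looking them up verbatim):

```python
from collections.abc import MutableMapping

class PassthroughStore(MutableMapping):
    """Toy key/value store: every Zarr key maps 1:1 to an object of the same name."""

    def __init__(self):
        self._objects = {}  # stands in for a cloud object storage bucket

    def __getitem__(self, key):   # e.g. key == "myarray/c/0/0"
        return self._objects[key]

    def __setitem__(self, key, value):
        self._objects[key] = value

    def __delitem__(self, key):
        del self._objects[key]

    def __iter__(self):
        return iter(self._objects)

    def __len__(self):
        return len(self._objects)
```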
+ +- __Standard Zarr + Fsspec__ + + --- + + In standard Zarr usage (without Icechunk), [fsspec](https://filesystem-spec.readthedocs.io/) sits + between the Zarr library and the object store, translating Zarr keys directly to object store keys. + + ```mermaid + flowchart TD + zarr-python[Zarr Library] <-- key / value--> icechunk[fsspec] + icechunk <-- key / value --> storage[(Object Storage)] + ``` + +- __Zarr + Icechunk__ + + --- + + With Icechunk, the Icechunk library intercepts the Zarr keys and translates them to the + Icechunk schema, storing data in object storage using its own format. + + ```mermaid + flowchart TD + zarr-python[Zarr Library] <-- key / value--> icechunk[Icechunk Library] + icechunk <-- icechunk data / metadata files --> storage[(Object Storage)] + ``` + +
+ + +Implementing Icechunk this way allows Icechunk's specification to evolve independently from Zarr's, +maintaining interoperability while enabling rapid iteration and promoting innovation on the I/O layer. + +## Is Icechunk part of the Zarr Spec? + +No. At the moment, the Icechunk spec is completely independent of the Zarr spec. + +In the future, we may choose to propose Icechunk as a Zarr extension. +However, because it sits _below_ Zarr in the stack, it's not immediately clear how to do that. + +## Should I implement Icechunk on my own based on the spec? + +No, we do not recommend implementing Icechunk independently of the existing Rust library. +There are two reasons for this: + +1. The spec has not yet been stabilized and is still evolving rapidly. +1. It's probably much easier to bind to the Rust library from your language of choice, + rather than re-implement from scratch. + +We welcome contributions from folks interested in developing Icechunk bindings for other languages! + +## Is Icechunk stable? + +The Icechunk library is reasonably well-tested and performant. +The Rust-based core library provides a solid foundation of correctness, safety, and speed. + +However, the actual on disk format is still evolving and may change from one alpha release to the next. +Until Icechunk reaches v1.0, we can't commit to long-term stability of the on-disk format. +This means Icechunk can't yet be used for production uses which require long-term persistence of data. + +😅 Don't worry! We are working as fast as we can and aim to release v1.0 soon! + +## Is Icechunk fast? + +We have not yet begun the process of optimizing Icechunk for performance. +Our focus so far has been on correctness and delivering the features needed for full interoperability with Zarr and Xarray. + +However, preliminary investigations indicate that Icechunk is at least as fast as the existing Zarr / Dask / fsspec stack +and in many cases achieves significantly lower latency and higher throughput. +Furthermore, Icechunk achieves this without using Dask, by implementing its own asynchronous multithreaded I/O pipeline. + +## How does Icechunk compare to X? + +### Array Formats + +Array formats are file formats for storing multi-dimensional array (tensor) data. +Icechunk is an array format. +Here is how Icechunk compares to other popular array formats. + +#### [HDF5](https://www.hdfgroup.org/solutions/hdf5/) + +HDF5 (Hierarchical Data Format version 5) is a popular format for storing scientific data. +HDF is widely used in high-performance computing. + +
+ +- __Similarities__ + + --- + + Icechunk and HDF5 share the same data model: multidimensional arrays and metadata organized into a hierarchical tree structure. + This data model can accommodate a wide range of different use cases and workflows. + + Both Icechunk and HDF5 use the concept of "chunking" to split large arrays into smaller storage units. + +- __Differences__ + + --- + + HDF5 is a monolithic file format designed first and foremost for POSIX filesystems. + All of the chunks in an HDF5 dataset live within a single file. + The size of an HDF5 dataset is limited to the size of a single file. + HDF5 relies on the filesystem for consistency and is not designed for multiple concurrent yet uncoordinated readers and writers. + + Icechunk spreads chunks over many files and is designed first and foremost for cloud object storage. + Icechunk can accommodate datasets of arbitrary size. + Icechunk's optimistic concurrency design allows for safe concurrent access for uncoordinated readers and writers. + +
+
+#### [NetCDF](https://www.unidata.ucar.edu/software/netcdf/)
+
+> NetCDF (Network Common Data Form) is a set of software libraries and machine-independent data formats that support the creation, access, and sharing of array-oriented scientific data.
+
+NetCDF4 uses HDF5 as its underlying file format.
+Therefore, the similarities and differences with Icechunk are fundamentally the same.
+
+Icechunk can accommodate the NetCDF data model.
+It's possible to write NetCDF-compliant data in Icechunk using [Xarray](https://xarray.dev/).
+
+#### [Zarr](https://zarr.dev)
+
+Icechunk works together with Zarr.
+(See [What is Icechunk's relationship to Zarr?](#what-is-icechunks-relationship-to-zarr) for more detail.)
+
+Compared to regular Zarr (without Icechunk), Icechunk offers many benefits, including:
+
+- Serializable isolation of updates via transactions
+- Data version control (snapshots, branches, tags)
+- Ability to store references to chunks in external datasets (HDF5, NetCDF, GRIB, etc.)
+- A Rust-optimized I/O layer for communicating with object storage
+
+#### [Cloud Optimized GeoTIFF](http://cogeo.org/) (CoG)
+
+> A Cloud Optimized GeoTIFF (COG) is a regular GeoTIFF file, aimed at being hosted on a HTTP file server, with an internal organization that enables more efficient workflows on the cloud.
+> It does this by leveraging the ability of clients issuing HTTP GET range requests to ask for just the parts of a file they need.
+
+CoG has become very popular in the geospatial community as a cloud-native format for raster data.
+A CoG file contains a single image (possibly with multiple bands), sharded into chunks of an appropriate size.
+A CoG also contains "overviews," lower resolution versions of the same data.
+Finally, a CoG contains relevant geospatial metadata regarding projection, CRS, etc., which allows georeferencing of the data.
+
+Data identical to what is found in a CoG can be stored in the Zarr data model and therefore in an Icechunk repo.
+Furthermore, Zarr / Icechunk can accommodate rasters of arbitrarily large size and facilitate massive-scale concurrent writes (in addition to reads).
+A CoG, in contrast, is limited to a single file and thus has limitations on scale and write concurrency.
+
+However, Zarr and Icechunk currently do not offer the same level of broad geospatial interoperability that CoG does.
+The [GeoZarr](https://github.com/zarr-developers/geozarr-spec) project aims to change that.
+
+#### [TileDB Embedded](https://docs.tiledb.com/main/background/key-concepts-and-data-format)
+
+TileDB Embedded is an innovative array storage format that bears many similarities to both Zarr and Icechunk.
+Like TileDB Embedded, Icechunk aims to provide database-style features on top of the array data model.
+Both technologies use an embedded / serverless architecture, where client processes interact directly with
+data files in storage, rather than through a database server.
+However, there are a number of differences, enumerated below.
+
+The following table compares Zarr + Icechunk with TileDB Embedded in a few key areas:
+
+| feature | **Zarr + Icechunk** | **TileDB Embedded** | Comment |
+|---------|---------------------|---------------------|---------|
+| *atomicity* | atomic updates can span multiple arrays and groups | _array fragments_ limited to a single array | Icechunk's model allows a writer to stage many updates across interrelated arrays into a single transaction. |
+| *concurrency and isolation* | serializable isolation of transactions | [eventual consistency](https://docs.tiledb.com/main/background/internal-mechanics/consistency) | While both formats enable lock-free concurrent reading and writing, Icechunk can catch (and potentially reject) inconsistent, out-of-order updates. |
+| *versioning* | snapshots, branches, tags | linear version history | Icechunk's data versioning model is closer to Git's. |
+| *unit of storage* | chunk | tile | (basically the same thing) |
+| *minimum write* | chunk | cell | TileDB allows atomic updates to individual cells, while Zarr requires writing an entire chunk. |
+| *sparse arrays* | :material-close: | :material-check: | Zarr + Icechunk do not currently support sparse arrays. |
+| *virtual chunk references* | :material-check: | :material-close: | Icechunk enables references to chunks in other file formats (HDF5, NetCDF, GRIB, etc.), while TileDB does not. |
+
+Beyond this list, there are numerous differences in the design, file layout, and implementation of Icechunk and TileDB Embedded
+which may lead to differences in suitability and performance for different workloads.
+
+#### [SafeTensors](https://github.com/huggingface/safetensors)
+
+SafeTensors is a format developed by HuggingFace for storing tensors (arrays) safely, in contrast to Python pickle objects.
+
+By the same criteria, Icechunk and Zarr are also "safe", in that it is impossible to trigger arbitrary code execution when reading data.
+
+SafeTensors is a single-file format, like HDF5.
+SafeTensors optimizes for a simple on-disk layout that facilitates mem-map-based zero-copy reading in ML training pipelines,
+assuming that the data are being read from a local POSIX filesystem.
+Zarr and Icechunk instead allow for flexible chunking and compression to optimize I/O against object storage.
+
+### Tabular Formats
+
+Tabular formats are for storing tabular data.
+Tabular formats are extremely prevalent in general-purpose data analytics but are less widely used in scientific domains.
+The tabular data model is different from Icechunk's multidimensional array data model, and so a direct comparison is not always apt.
+However, Icechunk is inspired by many tabular file formats, and there are some notable similarities.
+
+#### [Apache Parquet](https://parquet.apache.org/)
+
+> Apache Parquet is an open source, column-oriented data file format designed for efficient data storage and retrieval.
+> It provides high performance compression and encoding schemes to handle complex data in bulk and is supported in many programming languages and analytics tools.
+
+Parquet employs many of the same core technological concepts used in Zarr + Icechunk such as chunking, compression, and efficient metadata access in a cloud context.
+Both formats support a range of different numerical data types.
+Both are "columnar" in the sense that different columns / variables / arrays can be queried efficiently without having to fetch unwanted data from other columns.
+Both also support attaching arbitrary key-value metadata to variables.
+Parquet supports "nested" types like variable-length lists, dicts, etc. that are currently unsupported in Zarr (but may be possible in the future).
+
+In general, Parquet and other tabular formats can't be substituted for Zarr / Icechunk, due to the lack of multidimensional array support.
+On the other hand, tabular data can be modeled in Zarr / Icechunk in a relatively straightforward way: each column as a 1D array, and a table / dataframe as a group of same-sized 1D arrays.
+
+#### [Apache Iceberg](https://iceberg.apache.org/)
+
+> Iceberg is a high-performance format for huge analytic tables.
+> Iceberg brings the reliability and simplicity of SQL tables to big data, while making it possible for engines like Spark, Trino, Flink, Presto, Hive and Impala to safely work with the same tables, at the same time.
+
+Iceberg is commonly used to manage many Parquet files as a single table in object storage.
+
+Iceberg was influential in the design of Icechunk.
+Many of the core requirements in the Icechunk [spec](./spec.md) are similar to those of Iceberg.
+Specifically, both formats share the following properties:
+
+- Files are written to object storage immutably
+- All data and metadata files are tracked explicitly by manifests
+- Similar mechanism for staging snapshots and committing transactions
+- Support for branches and tags
+
+However, unlike Iceberg, Icechunk _does not require an external catalog_ to commit transactions; it relies solely on the consistency of the object store.
+
+#### [Delta Lake](https://delta.io/)
+
+Delta is another popular table format based on a log of updates to the table state.
+Its functionality and design are quite similar to Iceberg's, as is its comparison to Icechunk.
+
+#### [Lance](https://lancedb.github.io/lance/index.html)
+
+> Lance is a modern columnar data format that is optimized for ML workflows and datasets.
+
+Despite its focus on multimodal data, as a columnar format, Lance can't accommodate large arrays / tensors chunked over arbitrary dimensions, making it fundamentally different from Icechunk.
+
+However, the modern design of Lance was very influential on Icechunk.
+Icechunk's commit and conflict resolution mechanism is partly inspired by Lance.
+
+### Other Related Projects
+
+#### [Xarray](https://xarray.dev/)
+
+> Xarray is an open source project and Python package that introduces labels in the form of dimensions, coordinates, and attributes on top of raw NumPy-like arrays, which allows for more intuitive, more concise, and less error-prone user experience.
+>
+> Xarray includes a large and growing library of domain-agnostic functions for advanced analytics and visualization with these data structures.
+
+Xarray and Zarr / Icechunk work great together!
+Xarray is the recommended way to read and write Icechunk data for Python users in geospatial, weather, climate, and similar domains.
+
+#### [Kerchunk](https://fsspec.github.io/kerchunk/)
+
+> Kerchunk is a library that provides a unified way to represent a variety of chunked, compressed data formats (e.g. NetCDF/HDF5, GRIB2, TIFF, …), allowing efficient access to the data from traditional file systems or cloud object storage.
+> It also provides a flexible way to create virtual datasets from multiple files. It does this by extracting the byte ranges, compression information and other information about the data and storing this metadata in a new, separate object.
+> This means that you can create a virtual aggregate dataset over potentially many source files, for efficient, parallel and cloud-friendly in-situ access without having to copy or translate the originals.
+> It is a gateway to in-the-cloud massive data processing while the data providers still insist on using legacy formats for archival storage.
+
+Kerchunk emerged from the [Pangeo](https://www.pangeo.io/) community as an experimental
+way of reading archival files, allowing those files to be accessed "virtually" using the Zarr protocol.
+Kerchunk pioneered the concept of a "chunk manifest", a file containing references to compressed binary chunks in other files in the form of the tuple `(uri, offset, size)`.
+Kerchunk has experimented with different ways of serializing chunk manifests, including JSON and Parquet.
+
+Icechunk provides a highly efficient and scalable mechanism for storing and tracking the references generated by Kerchunk.
+Kerchunk and Icechunk are highly complementary.
+
+#### [VirtualiZarr](https://virtualizarr.readthedocs.io/en/latest/)
+
+> VirtualiZarr creates virtual Zarr stores for cloud-friendly access to archival data, using familiar Xarray syntax.
+
+VirtualiZarr provides another way of generating and manipulating Kerchunk-style references.
+VirtualiZarr first uses Kerchunk to generate virtual references, but then provides a simple Xarray-based interface for manipulating those references.
+As VirtualiZarr can also write virtual references into an Icechunk Store directly, together they form a complete pipeline for generating and storing references to multiple pre-existing files.
+
+#### [LakeFS](https://lakefs.io/)
+
+LakeFS is a solution for git-style version control on top of cloud object storage.
+LakeFS enables git-style commits, tags, and branches representing the state of an entire object storage bucket.
+
+LakeFS is format agnostic and can accommodate any type of data, including Zarr.
+LakeFS can therefore be used to create a versioned Zarr store, similar to Icechunk.
+
+Icechunk, however, is designed specifically for array data, based on the Zarr data model.
+This specialization enables numerous optimizations and user-experience enhancements not possible with LakeFS.
+
+LakeFS also requires a server to operate.
+Icechunk, in contrast, works with just object storage.
+
+#### [TensorStore](https://google.github.io/tensorstore/index.html)
+
+> TensorStore is a library for efficiently reading and writing large multi-dimensional arrays.
+
+TensorStore can read and write a variety of different array formats, including Zarr.
+
+While TensorStore is not yet compatible with Icechunk, it should be possible to implement Icechunk support in TensorStore.
+
+TensorStore implements an [ocdbt driver](https://google.github.io/tensorstore/kvstore/ocdbt/index.html#ocdbt-key-value-store-driver):
+
+> The ocdbt driver implements an Optionally-Cooperative Distributed B+Tree (OCDBT) on top of a base key-value store.
+
+OCDBT implements a transactional, versioned key-value store suitable for storing Zarr data, thereby supporting some of the same features as Icechunk.
+Unlike Icechunk, the ocdbt key-value store is not specialized to Zarr, does not differentiate between chunk or metadata keys, and does not store any metadata about chunks.
+
+
diff --git a/docs/docs/icechunk-python/concurrency.md b/docs/docs/icechunk-python/concurrency.md
new file mode 100644
index 00000000..0fe06981
--- /dev/null
+++ b/docs/docs/icechunk-python/concurrency.md
@@ -0,0 +1,15 @@
+# Concurrency
+
+TODO: describe the general approach to concurrency in Icechunk
+
+## Built-in concurrency
+
+Describe the multi-threading and async concurrency in Icechunk / Zarr
+
+## Distributed concurrency within a single transaction
+
+"Cooperative" concurrency
+
+## Concurrency across uncoordinated sessions
+
+### Conflict detection
\ No newline at end of file
diff --git a/docs/docs/icechunk-python/configuration.md b/docs/docs/icechunk-python/configuration.md
new file mode 100644
index 00000000..2fb2c224
--- /dev/null
+++ b/docs/docs/icechunk-python/configuration.md
@@ -0,0 +1,214 @@
+# Configuration
+
+When creating and opening Icechunk stores, there are two different sets of configuration to be aware of:
+
+- [`StorageConfig`](./reference.md#icechunk.StorageConfig) - for configuring access to the object store or filesystem
+- [`StoreConfig`](./reference.md#icechunk.StoreConfig) - for configuring the behavior of the Icechunk Store itself
+
+## Storage Config
+
+Icechunk can be configured to work with both object storage and filesystem backends. The storage configuration defines the location of an Icechunk store, along with any options or information needed to access data from a given storage type.
+
+### S3 Storage
+
+When using Icechunk with S3-compatible storage systems, credentials must be provided to allow access to the data on the given endpoint. Icechunk allows for creating the storage config for S3 in three ways:
+
+=== "From environment"
+
+    With this option, the credentials for connecting to S3 are detected automatically from your environment.
+    This is usually the best choice if you are connecting from within an AWS environment (e.g. from EC2). [See the API](./reference.md#icechunk.StorageConfig.s3_from_env)
+
+    ```python
+    icechunk.StorageConfig.s3_from_env(
+        bucket="icechunk-test",
+        prefix="quickstart-demo-1"
+    )
+    ```
+
+=== "Provide credentials"
+
+    With this option, you provide your credentials and other details explicitly. [See the API](./reference.md#icechunk.StorageConfig.s3_from_config)
+
+    ```python
+    icechunk.StorageConfig.s3_from_config(
+        bucket="icechunk-test",
+        prefix="quickstart-demo-1",
+        region='us-east-1',
+        credentials=S3Credentials(
+            access_key_id='my-access-key',
+            secret_access_key='my-secret-key',
+            # session token is optional
+            session_token='my-token',
+        ),
+        endpoint_url=None,
+        allow_http=False,
+    )
+    ```
+
+=== "Anonymous"
+
+    With this option, you connect to S3 anonymously (without credentials).
+    This is suitable for public data. [See the API](./reference.md#icechunk.StorageConfig.s3_anonymous)
+
+    ```python
+    icechunk.StorageConfig.s3_anonymous(
+        bucket="icechunk-test",
+        prefix="quickstart-demo-1",
+        region='us-east-1',
+    )
+    ```
+
+### Filesystem Storage
+
+Icechunk can also be used on a [local filesystem](./reference.md#icechunk.StorageConfig.filesystem) by providing a path to the location of the store.
+
+=== "Local filesystem"
+
+    ```python
+    icechunk.StorageConfig.filesystem("/path/to/my/dataset")
+    ```
+
+## Store Config
+
+Separate from the storage config, the Store can also be configured with options which control its runtime behavior.
+
+### Writing chunks inline
+
+Chunks can be written inline alongside the store metadata if the size of a given chunk falls within the configured threshold.
+Inlining allows these small chunks (often used to store small coordinate variables) to be accessed more quickly.
+This is the default behavior for chunks smaller than 512 bytes, but it can be overridden using the `inline_chunk_threshold_bytes` option:
+
+=== "Never write chunks inline"
+
+    ```python
+    StoreConfig(
+        inline_chunk_threshold_bytes=0,
+        ...
+    )
+    ```
+
+=== "Write bigger chunks inline"
+
+    ```python
+    StoreConfig(
+        inline_chunk_threshold_bytes=1024,
+        ...
+    )
+    ```
+
+### Virtual Reference Storage Config
+
+Icechunk allows for reading "Virtual" data from [existing archival datasets](./virtual.md). This requires creating a distinct `VirtualRefConfig` (similar to `StorageConfig`) giving Icechunk the necessary permissions to access the archival data. This can be configured using the `virtual_ref_config` option:
+
+=== "S3 from environment"
+
+    ```python
+    StoreConfig(
+        virtual_ref_config=VirtualRefConfig.s3_from_env(),
+        ...
+    )
+    ```
+
+=== "S3 with credentials"
+
+    ```python
+    StoreConfig(
+        virtual_ref_config=VirtualRefConfig.s3_from_config(
+            credential=S3Credentials(
+                access_key_id='my-access-key',
+                secret_access_key='my-secret-key',
+            ),
+            region='us-east-1'
+        ),
+        ...
+    )
+    ```
+
+=== "S3 Anonymous"
+
+    ```python
+    StoreConfig(
+        virtual_ref_config=VirtualRefConfig.s3_anonymous(region='us-east-1'),
+        ...
+    )
+    ```
+
+## Creating and Opening Repos
+
+Now we can create or open an Icechunk store using our config.
+
+### Creating a new store
+
+!!! note
+
+    Icechunk stores cannot be created in the same location where another store already exists.
+
+=== "Creating with S3 storage"
+
+    ```python
+    storage = icechunk.StorageConfig.s3_from_env(
+        bucket='earthmover-sample-data',
+        prefix='icechunk/oisst.2020-2024/',
+        region='us-east-1',
+    )
+
+    store = icechunk.IcechunkStore.create(
+        storage=storage,
+        mode="w",
+    )
+    ```
+
+=== "Creating with local filesystem"
+
+    ```python
+    storage = icechunk.StorageConfig.filesystem("/path/to/my/dataset")
+    config = icechunk.StoreConfig(
+        inline_chunk_threshold_bytes=1024,
+    )
+
+    store = icechunk.IcechunkStore.create(
+        storage=storage,
+        mode="w",
+    )
+    ```
+
+### Opening an existing store
+
+=== "Opening from S3 Storage"
+
+    ```python
+    storage = icechunk.StorageConfig.s3_anonymous(
+        bucket='earthmover-sample-data',
+        prefix='icechunk/oisst.2020-2024/',
+        region='us-east-1',
+    )
+
+    config = icechunk.StoreConfig(
+        virtual_ref_config=icechunk.VirtualRefConfig.s3_anonymous(region='us-east-1'),
+    )
+
+    store = icechunk.IcechunkStore.open_existing(
+        storage=storage,
+        mode="r+",
+        config=config,
+    )
+    ```
+
+=== "Opening from local filesystem"
+
+    ```python
+    storage = icechunk.StorageConfig.filesystem("/path/to/my/dataset")
+    config = icechunk.StoreConfig(
+        inline_chunk_threshold_bytes=1024,
+    )
+
+    store = icechunk.IcechunkStore.open_existing(
+        storage=storage,
+        mode='r+',
+        config=config,
+    )
+    ```
+
+#### Access Mode
+
+Note that in all of the above examples, a `mode` is provided to specify the user's access level to the store. The mode determines whether the store is opened read-only or writable, and whether it should start with a clean slate (Icechunk prevents you from accidentally overwriting any data that was previously committed to the store). For more about the access modes, see the [`zarr-python` docs](https://zarr.readthedocs.io/en/v3/_autoapi/zarr/abc/store/index.html#zarr.abc.store.AccessMode).
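+
+For example, an analysis job and an ingestion job might open the same store with different modes. The snippet below is a minimal sketch reusing the `StorageConfig` / `IcechunkStore` calls shown above; the filesystem path is a placeholder, and the optional `config` argument is omitted for brevity.
+
+```python
+import icechunk
+
+storage = icechunk.StorageConfig.filesystem("/path/to/my/dataset")
+
+# Read-only session: can read any committed snapshot, but cannot write
+read_store = icechunk.IcechunkStore.open_existing(storage=storage, mode="r")
+
+# Read-write session: can stage changes and commit new snapshots
+write_store = icechunk.IcechunkStore.open_existing(storage=storage, mode="r+")
+```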
\ No newline at end of file
diff --git a/docs/docs/icechunk-python/index.md b/docs/docs/icechunk-python/index.md
new file mode 100644
index 00000000..0317b3c2
--- /dev/null
+++ b/docs/docs/icechunk-python/index.md
@@ -0,0 +1,7 @@
+# Index of icechunk-python
+
+- [developing](/icechunk-python/developing/)
+- [examples](/icechunk-python/examples/)
+- [notebooks](/icechunk-python/notebooks/)
+- [quickstart](/icechunk-python/quickstart/)
+- [reference](/icechunk-python/reference/)
diff --git a/docs/docs/icechunk-python/quickstart.md b/docs/docs/icechunk-python/quickstart.md
new file mode 100644
index 00000000..d05d78d5
--- /dev/null
+++ b/docs/docs/icechunk-python/quickstart.md
@@ -0,0 +1,113 @@
+# Quickstart
+
+Icechunk is designed to be mostly in the background.
+As a Python user, you'll mostly be interacting with Zarr.
+If you're not familiar with Zarr, you may want to start with the [Zarr Tutorial](https://zarr.readthedocs.io/en/latest/tutorial.html).
+
+## Installation
+
+Install Icechunk with pip
+
+```shell
+pip install icechunk
+```
+
+!!! note
+
+    Icechunk is currently designed to support the [Zarr V3 Specification](https://zarr-specs.readthedocs.io/en/latest/v3/core/v3.0.html).
+    Using it today requires installing the latest pre-release of Zarr Python 3.
+
+
+## Create a new store
+
+To get started, let's create a new Icechunk store.
+We recommend creating your store on S3 to get the most out of Icechunk's cloud-native design.
+However, you can also create a store on your local filesystem.
+
+=== "S3 Storage"
+
+    ```python
+    import icechunk
+
+    storage_config = icechunk.StorageConfig.s3_from_env(
+        bucket="icechunk-test",
+        prefix="quickstart-demo-1"
+    )
+    store = icechunk.IcechunkStore.create(storage_config)
+    ```
+
+=== "Local Storage"
+
+    ```python
+    import icechunk
+
+    storage_config = icechunk.StorageConfig.filesystem("./icechunk-local")
+    store = icechunk.IcechunkStore.create(storage_config)
+    ```
+
+## Write some data and commit
+
+We can now use our Icechunk `store` with Zarr.
+Let's first create a group and an array within it.
+
+```python
+import zarr
+
+group = zarr.group(store)
+array = group.create("my_array", shape=10, dtype=int)
+```
+
+Now let's write some data
+
+```python
+array[:] = 1
+```
+
+Now let's commit our update
+
+```python
+store.commit("first commit")
+```
+
+🎉 Congratulations! You just made your first Icechunk snapshot.
+
+## Make a second commit
+
+Let's now put some new data into our array, overwriting the first five elements.
+
+```python
+array[:5] = 2
+```
+
+...and commit the changes
+
+```python
+store.commit("overwrite some values")
+```
+
+## Explore version history
+
+We can see the full version history of our repo:
+
+```python
+hist = store.ancestry()
+for anc in hist:
+    print(anc.id, anc.message, anc.written_at)
+
+# Output:
+# AHC3TSP5ERXKTM4FCB5G overwrite some values 2024-10-14 14:07:27.328429+00:00
+# Q492CAPV7SF3T1BC0AA0 first commit 2024-10-14 14:07:26.152193+00:00
+# T7SMDT9C5DZ8MP83DNM0 Repository initialized 2024-10-14 14:07:22.338529+00:00
+```
+
+...and we can go back in time to the earlier version.
+
+```python
+# latest version
+assert array[0] == 2
+# check out earlier snapshot
+store.checkout(snapshot_id=hist[1].id)
+# verify data matches first version
+assert array[0] == 1
+```
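+
+To return to the most recent version, you can check the latest snapshot out again. This is a minimal sketch that reuses the `hist` list from above (the ancestry is ordered newest first):
+
+```python
+# return to the most recent snapshot (the first entry in the ancestry list)
+store.checkout(snapshot_id=hist[0].id)
+assert array[0] == 2
+```
+
+---
+
+That's it! You now know how to use Icechunk!
+For your next step, dig deeper into [configuration](./configuration.md),
+explore the [version control system](./version-control.md), or learn how to
+[use Icechunk with Xarray](./xarray.md).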
\ No newline at end of file
diff --git a/docs/docs/icechunk-python/reference.md b/docs/docs/icechunk-python/reference.md
new file mode 100644
index 00000000..5eab2a5f
--- /dev/null
+++ b/docs/docs/icechunk-python/reference.md
@@ -0,0 +1 @@
+::: icechunk
diff --git a/docs/docs/icechunk-python/version-control.md b/docs/docs/icechunk-python/version-control.md
new file mode 100644
index 00000000..410fe9f0
--- /dev/null
+++ b/docs/docs/icechunk-python/version-control.md
@@ -0,0 +1,6 @@
+# Version Control
+
+COMING SOON!
+
+In the meantime, you can read about [version control in Arraylake](https://docs.earthmover.io/arraylake/version-control),
+which is very similar to version control in Icechunk.
\ No newline at end of file
diff --git a/docs/docs/icechunk-python/virtual.md b/docs/docs/icechunk-python/virtual.md
new file mode 100644
index 00000000..8e2e9ef0
--- /dev/null
+++ b/docs/docs/icechunk-python/virtual.md
@@ -0,0 +1,213 @@
+# Virtual Datasets
+
+While Icechunk works wonderfully with native chunks managed by Zarr, there is lots of archival data out there in other formats already. To interoperate with such data, Icechunk supports "Virtual" chunks, where any number of chunks in a given dataset may reference external data in existing archival formats, such as netCDF, HDF, GRIB, or TIFF. Virtual chunks are loaded directly from the original source without copying or modifying the original archival data files. This enables Icechunk to manage large datasets from existing data without needing that data to be in Zarr format already.
+
+!!! warning
+
+    While virtual references are fully supported in Icechunk, creating virtual datasets currently relies on using experimental or pre-release versions of open source tools. For full instructions on how to install the required tools and their current statuses [see the tracking issue on Github](https://github.com/earth-mover/icechunk/issues/197).
+    With time, these experimental features will make their way into the released packages.
+
+To create virtual Icechunk datasets with Python, the community utilizes the [kerchunk](https://fsspec.github.io/kerchunk/) and [VirtualiZarr](https://virtualizarr.readthedocs.io/en/latest/) packages.
+
+`kerchunk` allows scanning the metadata of existing data files to extract virtual references. It also provides methods to combine these references into [larger virtual datasets](https://fsspec.github.io/kerchunk/tutorial.html#combine-multiple-kerchunked-datasets-into-a-single-logical-aggregate-dataset), which can be exported to its [reference format](https://fsspec.github.io/kerchunk/spec.html).
+
+`VirtualiZarr` lets users ingest existing data files into virtual datasets using various tools under the hood, including `kerchunk`, `xarray`, `zarr`, and now `icechunk`. It does so by creating virtual references to existing data that can be combined and manipulated to create larger virtual datasets using `xarray`. These datasets can then be exported to `kerchunk` reference format or to an `Icechunk` store, without ever copying or moving the existing data files.
+
+## Creating a virtual dataset with VirtualiZarr
+
+We are going to create a virtual dataset pointing to all of the [OISST](https://www.ncei.noaa.gov/products/optimum-interpolation-sst) data for August 2024. This data is distributed publicly as netCDF files on AWS S3, with one netCDF file containing the Sea Surface Temperature (SST) data for each day of the month.
We are going to use `VirtualiZarr` to combine all of these files into a single virtual dataset spanning the entire month, then write that dataset to Icechunk for use in analysis.
+
+!!! note
+
+    At this point you should have followed the instructions [here](https://github.com/earth-mover/icechunk/issues/197) to install the necessary experimental dependencies.
+
+Before we get started, we also need to install `fsspec` and `s3fs` for working with data on s3.
+
+```shell
+pip install fsspec s3fs
+```
+
+First, we need to find all of the files we are interested in. We will do this with `fsspec`, using a `glob` expression to find every netCDF file in the August 2024 folder in the bucket:
+
+```python
+import fsspec
+
+fs = fsspec.filesystem('s3')
+
+oisst_files = fs.glob('s3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr/202408/oisst-avhrr-v02r01.*.nc')
+
+oisst_files = sorted(['s3://'+f for f in oisst_files])
+#['s3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr/202408/oisst-avhrr-v02r01.20240801.nc',
+# 's3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr/202408/oisst-avhrr-v02r01.20240802.nc',
+# 's3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr/202408/oisst-avhrr-v02r01.20240803.nc',
+# 's3://noaa-cdr-sea-surface-temp-optimum-interpolation-pds/data/v2.1/avhrr/202408/oisst-avhrr-v02r01.20240804.nc',
+#...
+#]
+```
+
+Now that we have the filenames of the data we need, we can create virtual datasets with `VirtualiZarr`. This may take a minute.
+
+```python
+from virtualizarr import open_virtual_dataset
+
+virtual_datasets = [
+    open_virtual_dataset(url, indexes={})
+    for url in oisst_files
+]
+```
+
+We can now use `xarray` to combine these virtual datasets into one large virtual dataset (for more details on this operation see [`VirtualiZarr`'s documentation](https://virtualizarr.readthedocs.io/en/latest/usage.html#combining-virtual-datasets)). We know that each of our files shares the same structure but with a different date. So we are going to concatenate these datasets on the `time` dimension.
+
+```python
+import xarray as xr
+
+virtual_ds = xr.concat(
+    virtual_datasets,
+    dim='time',
+    coords='minimal',
+    compat='override',
+    combine_attrs='override'
+)
+
+# Size: 257MB
+# Dimensions:  (time: 31, zlev: 1, lat: 720, lon: 1440)
+# Coordinates:
+#     time     (time) float32 124B ManifestArray<...>
+#     ...
+# Data variables:
+#     sst, ice, anom, err  (time, zlev, lat, lon)  ManifestArray<...>
+```
+
+Success! We have created our full dataset with 31 timesteps spanning the month of August, all with virtual references to pre-existing data files in object storage. This means we can now version control our dataset, allowing us to update it and roll it back to a previous version without copying or moving any data from the original files.
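+
+To use the dataset, the virtual references first need to be written into an Icechunk store and committed. This step relies on the experimental VirtualiZarr/Icechunk integration described in the tracking issue above, so the exact entry point may differ depending on the versions you have installed. The sketch below assumes a `dataset_to_icechunk` helper; treat the import path and function name as placeholders to check against your installed VirtualiZarr version.
+
+```python
+import icechunk
+
+# Create a store for the virtual dataset (any StorageConfig works; a local
+# filesystem path is used here as a placeholder)
+storage = icechunk.StorageConfig.filesystem("./icechunk-oisst-virtual")
+store = icechunk.IcechunkStore.create(storage)
+
+# Write the virtual references into the store and commit them.
+# NOTE: `dataset_to_icechunk` is an assumed entry point; consult the tracking
+# issue / VirtualiZarr docs for the exact API in your version.
+from virtualizarr.writers.icechunk import dataset_to_icechunk
+
+dataset_to_icechunk(virtual_ds, store)
+store.commit("Add OISST data for August 2024")
+
+# Re-open the store as a regular (lazy) xarray dataset
+ds = xr.open_zarr(store, consolidated=False)
+# The re-opened dataset should look roughly like:
+# Size: 1GB
+# Dimensions:  (lon: 1440, time: 31, zlev: 1, lat: 720)
+# Coordinates:
+#   * lon      (lon) float32 6kB 0.125 0.375 0.625 0.875 ... 359.4 359.6 359.9
+#   * zlev     (zlev) float32 4B 0.0
+#   * time     (time) datetime64[ns] 248B 2024-08-01T12:00:00 ... 2024-08-31T12...
+#   * lat      (lat) float32 3kB -89.88 -89.62 -89.38 -89.12 ... 89.38 89.62 89.88
+# Data variables:
+#     sst      (time, zlev, lat, lon) float64 257MB dask.array
+#     ice      (time, zlev, lat, lon) float64 257MB dask.array
+#     anom     (time, zlev, lat, lon) float64 257MB dask.array
+#     err      (time, zlev, lat, lon) float64 257MB dask.array
+```
+
+Finally, let's make a plot of the sea surface temperature!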
+
+```python
+ds.sst.isel(time=26, zlev=0).plot(x='lon', y='lat', vmin=0)
+```
+
+![oisst](../assets/datasets/oisst.png)
+
+## Virtual Reference API
+
+While `VirtualiZarr` is the easiest way to create virtual datasets with Icechunk, the Store API that it uses to create the datasets in Icechunk is public. `IcechunkStore` contains a [`set_virtual_ref`](./reference.md#icechunk.IcechunkStore.set_virtual_ref) method that specifies a virtual ref for a specified chunk.
+
+### Virtual Reference Storage Support
+
+Currently, Icechunk supports two types of storage for virtual references:
+
+#### S3 Compatible
+
+References to files accessible via S3 compatible storage.
+
+##### Example
+
+Here is how we can set the chunk at key `c/0` to point to a file in an S3 bucket, `mybucket`, with the key `my/data/file.nc`:
+
+```python
+store.set_virtual_ref('c/0', 's3://mybucket/my/data/file.nc', offset=1000, length=200)
+```
+
+##### Configuration
+
+S3 virtual references require configuring credentials for the store to be able to access the specified s3 bucket. See [the configuration docs](./configuration.md#virtual-reference-storage-config) for instructions.
+
+
+#### Local Filesystem
+
+References to files accessible via local filesystem. This requires any file paths to be **absolute** at this time.
+
+##### Example
+
+Here is how we can set the chunk at key `c/0` to point to a file on my local filesystem located at `/path/to/my/file.nc`:
+
+```python
+store.set_virtual_ref('c/0', 'file:///path/to/my/file.nc', offset=20, length=100)
+```
+
+No extra configuration is necessary for local filesystem references.
+
+### Virtual Reference File Format Support
+
+Currently, Icechunk supports `HDF5` and `netcdf4` files for use in virtual references. See the [tracking issue](https://github.com/earth-mover/icechunk/issues/197) for more info.
diff --git a/docs/docs/icechunk-python/xarray.md b/docs/docs/icechunk-python/xarray.md
new file mode 100644
index 00000000..8ba6666e
--- /dev/null
+++ b/docs/docs/icechunk-python/xarray.md
@@ -0,0 +1,171 @@
+# Icechunk + Xarray
+
+Icechunk was designed to work seamlessly with Xarray. Xarray users can read and
+write data to Icechunk using [`xarray.open_zarr`](https://docs.xarray.dev/en/latest/generated/xarray.open_zarr.html#xarray.open_zarr)
+and [`xarray.Dataset.to_zarr`](https://docs.xarray.dev/en/latest/generated/xarray.Dataset.to_zarr.html#xarray.Dataset.to_zarr).
+
+!!! warning
+
+    Using Xarray and Icechunk together currently requires installing Xarray from source.
+
+    ```shell
+    pip install git+https://github.com/pydata/xarray@zarr-v3
+    ```
+
+    We expect this functionality to be included in Xarray's next release.
+
+In this example, we'll explain how to create a new Icechunk store, write some sample data
+to it, and append a second block of data using Icechunk's version control features.
+
+## Create a new store
+
+Similar to the example in [quickstart](/icechunk-python/quickstart/), we'll create an
+Icechunk store in S3 or a local file system. You will need to replace the `StorageConfig`
+with a bucket or file path that you have access to.
+
+```python
+import xarray as xr
+from icechunk import IcechunkStore, StorageConfig
+```
+
+=== "S3 Storage"
+
+    ```python
+    storage_config = StorageConfig.s3_from_env(
+        bucket="icechunk-test",
+        prefix="xarray-demo"
+    )
+    store = IcechunkStore.create(storage_config)
+    ```
+
+=== "Local Storage"
+
+    ```python
+    storage_config = StorageConfig.filesystem("./icechunk-xarray")
+    store = IcechunkStore.create(storage_config)
+    ```
+
+## Open tutorial dataset from Xarray
+
+For this demo, we'll open Xarray's RASM tutorial dataset and split it into two blocks.
+We'll write the two blocks to Icechunk in separate transactions later in this example.
+
+
+!!! note
+
+    Downloading xarray tutorial data requires `pooch` and `netCDF4`. These can be installed with
+
+    ```shell
+    pip install pooch netCDF4
+    ```
+
+```python
+ds = xr.tutorial.open_dataset('rasm')
+
+ds1 = ds.isel(time=slice(None, 18))  # part 1
+ds2 = ds.isel(time=slice(18, None))  # part 2
+```
+
+## Write Xarray data to Icechunk
+
+Writing Xarray data to Icechunk is as easy as calling `Dataset.to_zarr`:
+
+```python
+ds1.to_zarr(store, zarr_format=3, consolidated=False)
+```
+
+!!! note
+
+    1. [Consolidated metadata](https://docs.xarray.dev/en/latest/user-guide/io.html#consolidated-metadata)
+       is unnecessary (and unsupported) in Icechunk.
+       Icechunk already organizes the dataset metadata in a way that makes it very
+       fast to fetch from storage.
+    2. `zarr_format=3` is required until the default Zarr format changes in Xarray.
+
+After writing, we commit the changes:
+
+```python
+store.commit("add RASM data to store")
+# output: 'ME4VKFPA5QAY0B2YSG8G'
+```
+
+## Append to an existing store
+
+Next, we want to add a second block of data to our store. Above, we created `ds2` for just
+this reason. Again, we'll use `Dataset.to_zarr`, this time with `append_dim='time'`.
+
+```python
+ds2.to_zarr(store, append_dim='time')
+```
+
+And then we'll commit the changes:
+
+```python
+store.commit("append more data")
+# output: 'WW4V8V34QCZ2NXTD5DXG'
+```
+
+## Reading data with Xarray
+
+To read data stored in Icechunk with Xarray, we'll use `xarray.open_zarr`:
+
+```python
+xr.open_zarr(store, consolidated=False)
+# output: Size: 17MB
+# Dimensions:  (time: 36, y: 205, x: 275)
+# Coordinates:
+#   * time     (time) object 288B 1980-09-16 12:00:00 ... 1983-08-17 00:00:00
+#     xc       (y, x) float64 451kB dask.array
+#     yc       (y, x) float64 451kB dask.array
+# Dimensions without coordinates: y, x
+# Data variables:
+#     Tair     (time, y, x) float64 16MB dask.array
+# Attributes:
+#     NCO:                       netCDF Operators version 4.7.9 (Homepage = htt...
+#     comment:                   Output from the Variable Infiltration Capacity...
+#     convention:                CF-1.4
+#     history:                   Fri Aug  7 17:57:38 2020: ncatted -a bounds,,d...
+#     institution:               U.W.
+#     nco_openmp_thread_number:  1
+#     output_frequency:          daily
+#     output_mode:               averaged
+#     references:                Based on the initial model of Liang et al., 19...
+#     source:                    RACM R1002RBRxaaa01a
+#     title:                     /workspace/jhamman/processed/R1002RBRxaaa01a/l...
+```
+
+We can also read data from previous snapshots by checking out prior versions:
+
+```python
+store.checkout(snapshot_id='ME4VKFPA5QAY0B2YSG8G')
+
+xr.open_zarr(store, consolidated=False)
+# Size: 9MB
+# Dimensions:  (time: 18, y: 205, x: 275)
+# Coordinates:
+#     xc       (y, x) float64 451kB dask.array
+#     yc       (y, x) float64 451kB dask.array
+#   * time     (time) object 144B 1980-09-16 12:00:00 ... 1982-02-15 12:00:00
+# Dimensions without coordinates: y, x
+# Data variables:
+#     Tair     (time, y, x) float64 8MB dask.array
+# Attributes:
+#     NCO:                       netCDF Operators version 4.7.9 (Homepage = htt...
+#     comment:                   Output from the Variable Infiltration Capacity...
+#     convention:                CF-1.4
+#     history:                   Fri Aug  7 17:57:38 2020: ncatted -a bounds,,d...
+#     institution:               U.W.
+#     nco_openmp_thread_number:  1
+#     output_frequency:          daily
+#     output_mode:               averaged
+#     references:                Based on the initial model of Liang et al., 19...
+#     source:                    RACM R1002RBRxaaa01a
+#     title:                     /workspace/jhamman/processed/R1002RBRxaaa01a/l...
+```
+
+Notice that this second `xarray.Dataset` has a time dimension of length 18 whereas the
+first has a time dimension of length 36.
+
+## Next steps
+
+For more details on how to use Xarray's Zarr integration, check out [Xarray's documentation](https://docs.xarray.dev/en/stable/user-guide/io.html#zarr).
diff --git a/docs/docs/icechunk-rust.md b/docs/docs/icechunk-rust.md
new file mode 100644
index 00000000..12c10c85
--- /dev/null
+++ b/docs/docs/icechunk-rust.md
@@ -0,0 +1,14 @@
+---
+title: Rust
+---
+
+# Icechunk Rust
+
+The Icechunk Rust library is used internally by Icechunk Python.
+It is currently not designed to be used in standalone form.
+
+- [Icechunk Rust Documentation](https://docs.rs/icechunk/latest/icechunk/) at docs.rs
+
+We welcome contributors interested in implementing more Rust functionality!
+In particular, we would love to integrate Icechunk with [zarrs](https://docs.rs/zarrs/latest/zarrs/),
+a new Zarr Rust library.
\ No newline at end of file
diff --git a/docs/docs/index.md b/docs/docs/index.md
new file mode 100644
index 00000000..dfdbbc09
--- /dev/null
+++ b/docs/docs/index.md
@@ -0,0 +1,4 @@
+---
+template: home.html
+title: Icechunk - Open-source, cloud-native transactional tensor storage engine
+---
diff --git a/docs/docs/overview.md b/docs/docs/overview.md
new file mode 100644
index 00000000..b75d30f9
--- /dev/null
+++ b/docs/docs/overview.md
@@ -0,0 +1,141 @@
+---
+title: Overview
+---
+# Icechunk
+
+Icechunk is an open-source (Apache 2.0), transactional storage engine for tensor / ND-array data designed for use on cloud object storage.
+Icechunk works together with **[Zarr](https://zarr.dev/)**, augmenting the Zarr core data model with features
+that enhance performance, collaboration, and safety in a cloud-computing context.
+
+## Docs Organization
+
+This is the Icechunk documentation. It's organized into the following parts.
+
+- This page: a general overview of the project's goals and components.
+- [Frequently Asked Questions](./faq.md)
+- Documentation for [Icechunk Python](./icechunk-python), the main user-facing
+  library
+- Documentation for the [Icechunk Rust Crate](./icechunk-rust.md)
+- The [Icechunk Spec](./spec.md)
+
+## Icechunk Overview
+
+Let's break down what "transactional storage engine for Zarr" actually means:
+
+- **[Zarr](https://zarr.dev/)** is an open source specification for the storage of multidimensional array (a.k.a. tensor) data.
+  Zarr defines the metadata for describing arrays (shape, dtype, etc.) and the way these arrays are chunked, compressed, and converted to raw bytes for storage. Zarr can store its data in any key-value store.
+  There are many different implementations of Zarr in different languages.
  _Right now, Icechunk only supports
  [Zarr Python](https://zarr.readthedocs.io/en/stable/)._
+  If you're interested in implementing Icechunk support, please [open an issue](https://github.com/earth-mover/icechunk/issues) so we can help you.
+- **Storage engine** - Icechunk exposes a key-value interface to Zarr and manages all of the actual I/O for getting, setting, and updating both metadata and chunk data in cloud object storage.
+  Zarr libraries don't have to know exactly how Icechunk works under the hood in order to use it.
+- **Transactional** - The key improvement that Icechunk brings on top of regular Zarr is to provide consistent serializable isolation between transactions.
+  This means that Icechunk data are safe to read and write in parallel from multiple uncoordinated processes.
+  This allows Zarr to be used more like a database.
+
+The core entity in Icechunk is a repository or **repo**.
+A repo is defined as a Zarr hierarchy containing one or more Arrays and Groups, and a repo functions as a
+self-contained _Zarr Store_.
+The most common scenario is for an Icechunk repo to contain a single Zarr group with multiple arrays, each corresponding to different physical variables but sharing common spatiotemporal coordinates.
+However, formally a repo can be any valid Zarr hierarchy, from a single Array to a deeply nested structure of Groups and Arrays.
+Users of Icechunk should aim to scope their repos only to related arrays and groups that require consistent transactional updates.
+
+Icechunk supports the following core requirements:
+
+1. **Object storage** - the format is designed around the consistency features and performance characteristics available in modern cloud object storage. No external database or catalog is required to maintain a repo.
+(It also works with file storage.)
+1. **Serializable isolation** - Reads are isolated from concurrent writes and always use a committed snapshot of a repo. Writes are committed atomically and are never partially visible. No locks are required for reading.
+1. **Time travel** - Previous snapshots of a repo remain accessible after new ones have been written.
+1. **Data version control** - Repos support both _tags_ (immutable references to snapshots) and _branches_ (mutable references to snapshots).
+1. **Chunk sharding** - Chunk storage is decoupled from specific file names. Multiple chunks can be packed into a single object (sharding).
+1. **Chunk references** - Zarr-compatible chunks within other file formats (e.g. HDF5, NetCDF) can be referenced.
+1. **Schema evolution** - Arrays and Groups can be added, renamed, and removed from the hierarchy with minimal overhead.
+
+## Key Concepts
+
+### Groups, Arrays, and Chunks
+
+Icechunk is designed around the Zarr data model, widely used in scientific computing, data science, and AI / ML.
+(The Zarr high-level data model is effectively the same as HDF5.)
+The core data structure in this data model is the **array**.
+Arrays have two fundamental properties:
+
+- **shape** - a tuple of integers which specify the dimensions of each axis of the array. A 10 x 10 square array would have shape (10, 10).
+- **data type** - a specification of what type of data is found in each element, e.g. integer, float, etc. Different data types have different precision (e.g. 16-bit integer, 64-bit float, etc.)
+
+In Zarr / Icechunk, arrays are split into **chunks**.
+A chunk is the minimum unit of data that must be read / written from storage, and thus choices about chunking have strong implications for performance.
+Zarr leaves this completely up to the user.
+Chunk shape should be chosen based on the anticipated data access pattern for each array.
+An Icechunk array is not bounded by an individual file and is effectively unlimited in size.
+
+For further organization of data, Icechunk supports **groups** within a single repo.
+Groups are like folders that contain multiple arrays and/or other groups.
+Groups enable data to be organized into hierarchical trees.
+A common usage pattern is to store multiple arrays in a group representing a NetCDF-style dataset.
+
+Arbitrary JSON-style key-value metadata can be attached to both arrays and groups.
+
+### Snapshots
+
+Every update to an Icechunk store creates a new **snapshot** with a unique ID.
+Icechunk users must organize their updates into groups of related operations called **transactions**.
+For example, appending a new time slice to multiple arrays should be done as a single transaction, comprising the following steps:
+
+1. Update the array metadata to resize the array to accommodate the new elements.
+2. Write new chunks for each array in the group.
+
+While the transaction is in progress, none of these changes will be visible to other users of the store.
+Once the transaction is committed, a new snapshot is generated.
+Readers can only see and use committed snapshots.
+
+### Branches and Tags
+
+Additionally, snapshots occur in a specific linear (i.e. serializable) order within a **branch**.
+A branch is a mutable reference to a snapshot--a pointer that maps the branch name to a snapshot ID.
+The default branch is `main`.
+Every commit to the main branch updates this reference.
+Icechunk's design protects against the race condition in which two uncoordinated sessions attempt to update the branch at the same time; only one can succeed.
+
+Icechunk also defines **tags**--_immutable_ references to snapshots.
+Tags are appropriate for publishing specific releases of a repository or for any application which requires a persistent, immutable identifier for the store state.
+
+### Chunk References
+
+Chunk references are "pointers" to chunks that exist in other files--HDF5, NetCDF, GRIB, etc.
+Icechunk can store these references alongside native Zarr chunks as "virtual datasets".
+You can then update these virtual datasets incrementally (overwrite chunks, change metadata, etc.) without touching the underlying files.
+
+## How Does It Work?
+
+!!! note
+    For a more detailed explanation, have a look at the [Icechunk spec](./spec.md)
+
+Zarr itself works by storing both metadata and chunk data into an abstract store according to a specified system of "keys".
+For example, a 2D Zarr array called `myarray`, within a group called `mygroup`, would generate the following keys:
+
+```
+mygroup/zarr.json
+mygroup/myarray/zarr.json
+mygroup/myarray/c/0/0
+mygroup/myarray/c/0/1
+```
+
+In standard Zarr stores, these keys map directly to filenames in a filesystem or object keys in an object storage system.
+When writing data, a Zarr implementation will create these keys and populate them with data. When modifying existing arrays or groups, a Zarr implementation will potentially overwrite existing keys with new data.
+
+This is generally not a problem, as long as there is only one person or process coordinating access to the data.
+However, when multiple uncoordinated readers and writers attempt to access the same Zarr data at the same time, [various consistency problems](https://docs.earthmover.io/concepts/version-control-system#consistency-problems-with-zarr) emerge.
+These consistency problems can occur in both file storage and object storage; they are particularly severe in a cloud setting where Zarr is being used as an active store for data that are frequently changed while also being read.
+
+With Icechunk, we keep the same core Zarr data model, but add a layer of indirection between the Zarr keys and the on-disk storage.
+The Icechunk library translates between the Zarr keys and the actual on-disk data given the particular context of the user's state.
+Icechunk defines a series of interconnected metadata and data files that together enable efficient isolated reading and writing of metadata and chunks.
+Once written, these files are immutable.
+Icechunk keeps track of every single chunk explicitly in a "chunk manifest".
+
+```mermaid
+flowchart TD
+    zarr-python[Zarr Library] <-- key / value --> icechunk[Icechunk Library]
+    icechunk <-- data / metadata files --> storage[(Object Storage)]
+```
+
+
diff --git a/docs/docs/sample-datasets.md b/docs/docs/sample-datasets.md
new file mode 100644
index 00000000..92e63762
--- /dev/null
+++ b/docs/docs/sample-datasets.md
@@ -0,0 +1,31 @@
+---
+title: Sample Datasets
+---
+# Sample Datasets
+
+## Native Datasets
+
+
+## Virtual Datasets
+
+### NOAA [OISST](https://www.ncei.noaa.gov/products/optimum-interpolation-sst) Data
+
+> The NOAA 1/4° Daily Optimum Interpolation Sea Surface Temperature (OISST) is a long term Climate Data Record that incorporates observations from different platforms (satellites, ships, buoys and Argo floats) into a regular global grid
+
+Check out an example dataset built using all virtual references pointing to daily Sea Surface Temperature data from 2020 to 2024 on NOAA's S3 bucket using Python:
+
+```python
+import icechunk
+
+storage = icechunk.StorageConfig.s3_anonymous(
+    bucket='earthmover-sample-data',
+    prefix='icechunk/oisst.2020-2024/',
+    region='us-east-1',
+)
+
+store = icechunk.IcechunkStore.open_existing(storage=storage, mode="r", config=icechunk.StoreConfig(
+    virtual_ref_config=icechunk.VirtualRefConfig.s3_anonymous(region='us-east-1'),
+))
+```
+
+![oisst](./assets/datasets/oisst.png)
\ No newline at end of file
diff --git a/docs/docs/spec.md b/docs/docs/spec.md
new file mode 100644
index 00000000..3347776b
--- /dev/null
+++ b/docs/docs/spec.md
@@ -0,0 +1,348 @@
+---
+title: Specification
+---
+# Icechunk Specification
+
+**!!! Note:**
+    The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in [RFC 2119](https://www.rfc-editor.org/rfc/rfc2119.html).
+
+## Introduction
+
+The Icechunk specification is a storage specification for [Zarr](https://zarr-specs.readthedocs.io/en/latest/specs.html) data.
+Icechunk is inspired by Apache Iceberg and borrows many concepts and ideas from the [Iceberg Spec](https://iceberg.apache.org/spec/#version-2-row-level-deletes).
+
+This specification describes a single Icechunk **repository**.
+A repository is defined as a Zarr store containing one or more Arrays and Groups.
+The most common scenario is for a repository to contain a single Zarr group with multiple arrays, each corresponding to different physical variables but sharing common spatiotemporal coordinates.
+However, formally a repository can be any valid Zarr hierarchy, from a single Array to a deeply nested structure of Groups and Arrays. +Users of Icechunk should aim to scope their repository only to related arrays and groups that require consistent transactional updates. + +Icechunk defines a series of interconnected metadata and data files that together comprise the format. +All the data and metadata for a repository are stored in a directory in object storage or file storage. + +## Goals + +The goals of the specification are as follows: + +1. **Object storage** - the format is designed around the consistency features and performance characteristics available in modern cloud object storage. No external database or catalog is required. +1. **Serializable isolation** - Reads will be isolated from concurrent writes and always use a committed snapshot of a repository. Writes to repositories will be committed atomically and will not be partially visible. Readers will not acquire locks. +1. **Time travel** - Previous snapshots of a repository remain accessible after new ones have been written. +1. **Chunk sharding and references** - Chunk storage is decoupled from specific file names. Multiple chunks can be packed into a single object (sharding). Zarr-compatible chunks within other file formats (e.g. HDF5, NetCDF) can be referenced. +1. **Schema Evolution** - Arrays and Groups can be added, renamed, and removed from the hierarchy with minimal overhead. + +### Non Goals + +1. **Low Latency** - Icechunk is designed to support analytical workloads for large repositories. We accept that the extra layers of metadata files and indirection will introduce additional cold-start latency compared to regular Zarr. +1. **No Catalog** - The spec does not extend beyond a single repository or provide a way to organize multiple repositories into a hierarchy. +1. **Access Controls** - Access control is the responsibility of the storage medium. +The spec is not designed to enable fine-grained access restrictions (e.g. only read specific arrays) within a single repository. + +### Storage Operations + +Icechunk requires that the storage system support the following operations: + +- **In-place write** - Strong read-after-write and list-after-write consistency is expected. Files are not moved or altered once they are written. +- **Conditional write if-not-exists** - For the commit process to be safe and consistent, the storage system must guard against two files of the same name being created at the same time. +- **Seekable reads** - Chunk file formats may require seek support (e.g. shards). +- **Deletes** - Delete files that are no longer used (via a garbage-collection operation). +- **Sorted List** - The storage system must allow the listing of directories / prefixes in a consistent sorted order. + +These requirements are compatible with object stores, like S3, as well as with filesystems. + +The storage system is not required to support random-access writes. Once written, chunk and metadata files are immutable until they are deleted. + +## Specification + +### Overview + +Icechunk uses a series of linked metadata files to describe the state of the repository. + +- The **Snapshot file** records all of the different arrays and groups in the repository, plus their metadata. Every new commit creates a new snapshot file. The snapshot file contains pointers to one or more chunk manifest files and [optionally] attribute files. +- **Chunk manifests** store references to individual chunks. 
A single manifest may store references for multiple arrays or a subset of all the references for a single array. +- **Attributes files** provide a way to store additional user-defined attributes for arrays and groups outside of the structure file. This is important if attributes are very large, otherwise, they will be stored inline in the snapshot file. +- **Chunk files** store the actual compressed chunk data, potentially containing data for multiple chunks in a single file. +- **Reference files** track the state of branches and tags, containing a lightweight pointer to a snapshot file. Transactions on a branch are committed by creating the next branch file in a sequence. + +When reading from object store, the client opens the latest branch or tag file to obtain a pointer to the relevant snapshot file. +The client then reads the snapshot file to determine the structure and hierarchy of the repository. +When fetching data from an array, the client first examines the chunk manifest file[s] for that array and finally fetches the chunks referenced therein. + +When writing a new repository snapshot, the client first writes a new set of chunks and chunk manifests, and then generates a new snapshot file. +Finally, in an atomic put-if-not-exists operation, to commit the transaction, it creates the next branch file in the sequence. +This operation may fail if a different client has already committed the next snapshot. +In this case, the client may attempt to resolve the conflicts and retry the commit. + + +```mermaid +flowchart TD + subgraph metadata[Metadata] + subgraph reference_files[Reference Files] + old_branch[Main Branch File 001] + branch[Main Branch File 002] + end + subgraph snapshots[Snapshots] + snapshot1[Snapshot File 1] + snapshot2[Snapshot File 2] + end + subgraph attributes[Attributes] + attrs[Attribute File] + end + subgraph manifests[Manifests] + manifestA[Chunk Manifest A] + manifestB[Chunk Manifest B] + end + end + subgraph data + chunk1[Chunk File 1] + chunk2[Chunk File 2] + chunk3[Chunk File 3] + chunk4[Chunk File 4] + end + + branch -- snapshot ID --> snapshot2 + snapshot1 --> attrs + snapshot1 --> manifestA + snapshot2 --> attrs + snapshot2 -->manifestA + snapshot2 -->manifestB + manifestA --> chunk1 + manifestA --> chunk2 + manifestB --> chunk3 + manifestB --> chunk4 + +``` + +### File Layout + +All data and metadata files are stored within a root directory (typically a prefix within an object store) using the following directory structure. + +- `$ROOT` base URI (s3, gcs, local directory, etc.) +- `$ROOT/refs/` reference files +- `$ROOT/snapshots/` snapshot files +- `$ROOT/attributes/` attribute files +- `$ROOT/manifests/` chunk manifests +- `$ROOT/chunks/` chunks + +### File Formats + +!!! warning + The actual file formats used for each type of metadata file are in flux. The spec currently describes the data structures encoded in these files, rather than a specific file format. + + +### Reference Files + +Similar to Git, Icechunk supports the concept of _branches_ and _tags_. +These references point to a specific snapshot of the repository. + +- **Branches** are _mutable_ references to a snapshot. + Repositories may have one or more branches. + The default branch name is `main`. + Repositories must always have a `main` branch, which is used to detect the existence of a valid repository in a given path. + After creation, branches may be updated to point to a different snapshot. +- **Tags** are _immutable_ references to a snapshot. 
+  A repository may contain zero or more tags.
+  After creation, tags may never be updated, unlike in Git.
+
+References are very important in the Icechunk design.
+Creating or updating references is the point at which the consistency and atomicity of Icechunk transactions are enforced.
+Different client sessions may simultaneously create two inconsistent snapshots; however, only one session may successfully update a reference to point it to its snapshot.
+
+References (both branches and tags) are stored as JSON files; the content is a JSON object with:
+
+* key: a single key, `"snapshot"`,
+* value: a string representation of the snapshot id, using [Base 32 Crockford](https://www.crockford.com/base32.html) encoding. The snapshot id is a 12-byte random binary value, so the encoded string has 20 characters.
+
+
+Here is an example of a JSON file corresponding to a tag or branch:
+
+```json
+{"snapshot":"VY76P925PRY57WFEK410"}
+```
+
+#### Creating and Updating Branches
+
+The process of creating and updating branches is designed to use the limited consistency guarantees offered by object storage to ensure transactional consistency.
+When a client checks out a branch, it obtains a specific snapshot ID and uses this snapshot as the basis for any changes it creates during its session.
+The client creates a new snapshot and then updates the branch reference to point to the new snapshot (a "commit").
+However, when updating the branch reference, the client must detect whether a _different session_ has updated the branch reference in the interim, possibly retrying or failing the commit if so.
+This is an "optimistic concurrency" strategy; the resolution mechanism can be expensive, but conflicts are expected to be infrequent.
+
+All popular object stores support a "create if not exists" operation.
+In other words, object stores can guard against the race condition which occurs when two sessions attempt to create the same file at the same time.
+This motivates the design of Icechunk's branch file naming convention.
+
+Each commit to an Icechunk branch increments a counter called the _sequence number_.
+The first commit creates sequence number 0.
+The next commit creates sequence number 1, and so on.
+This sequence number is encoded into the branch reference file name.
+
+When a client checks out a branch, it keeps track of its current sequence number _N_.
+When it tries to commit, it attempts to create the file corresponding to sequence number _N + 1_ in an atomic "create if not exists" operation.
+If this succeeds, the commit is successful.
+If this fails (because another client created that file already), the commit fails.
+At this point, the client may choose to retry its commit (possibly re-reading the updated data) and then create sequence number _N + 2_.
+
+Branch references are stored in the `refs/` directory within a subdirectory corresponding to the branch name prefixed with the string `branch.`: `refs/branch.$BRANCH_NAME/`.
+Branch names may not contain the `/` character.
+
+To facilitate easy lookups of the latest branch reference, we use the following encoding for the sequence number:
+
+- subtract the sequence number from the integer `1099511627775`
+- encode the resulting integer as a string using [Base 32 Crockford](https://www.crockford.com/base32.html)
+- left-pad the string with 0s to a length of 8 characters
+
+This produces a deterministic sequence of branch file names in which the latest sequence always appears first when sorted lexicographically, facilitating easy lookup by listing the object store.
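+
+As an illustration, here is a minimal Python sketch of this encoding (the helper name `encode_branch_sequence` is ours, not part of the Icechunk API); it reproduces the example file names given below.
+
+```python
+# Crockford Base 32 alphabet (0-9, then letters excluding I, L, O, U)
+CROCKFORD = "0123456789ABCDEFGHJKMNPQRSTVWXYZ"
+
+def encode_branch_sequence(n: int) -> str:
+    """Encode a branch sequence number as an 8-character string that
+    sorts lexicographically from newest to oldest (illustrative only)."""
+    x = 1099511627775 - n  # 2**40 - 1, so later sequence numbers sort first
+    digits = []
+    for _ in range(8):
+        digits.append(CROCKFORD[x % 32])
+        x //= 32
+    return "".join(reversed(digits))
+
+assert encode_branch_sequence(0) == "ZZZZZZZZ"
+assert encode_branch_sequence(100) == "ZZZZZZWV"
+```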
+ +The full branch file name is then given by `refs/branch.$BRANCH_NAME/$ENCODED_SEQUENCE.json`. + +For example, the first `main` branch file in a store, corresponding to sequence number 0, is always named `refs/branch.main/ZZZZZZZZ.json`. +The branch file for sequence number 100 is `refs/branch.main/ZZZZZZWV.json`. +The maximum number of commits allowed in an Icechunk repository is consequently `1099511627775`, +corresponding to the branch file `refs/branch.main/00000000.json`. + +#### Tags + +Since tags are immutable, they are simpler than branches. + +Tag files follow the pattern `refs/tag.$TAG_NAME/ref.json`. + +Tag names may not contain the `/` character. + +When creating a new tag, the client attempts to create the tag file using a "create if not exists" operation. +If successful, the tag is created. +If not, that means another client has already created that tag. + +Tags cannot be deleted once created. + +### Snapshot Files + +The snapshot file fully describes the schema of the repository, including all arrays and groups. + +The snapshot file is currently encoded using [MessagePack](https://msgpack.org/), but this may change before Icechunk version 1.0. Given the alpha status of this spec, the best way to understand the information stored +in the snapshot file is through the data structure used internally by the Icechunk library for serialization. This data structure will almost certainly change before the spec is stabilized: + +```rust +pub struct Snapshot { + pub icechunk_snapshot_format_version: IcechunkFormatVersion, + pub icechunk_snapshot_format_flags: BTreeMap<String, rmpv::Value>, + + pub manifest_files: Vec<ManifestFileInfo>, + pub attribute_files: Vec<AttributeFileInfo>, + + pub total_parents: u32, + pub short_term_parents: u16, + pub short_term_history: VecDeque<SnapshotMetadata>, + + pub metadata: SnapshotMetadata, + pub started_at: DateTime<Utc>, + pub properties: SnapshotProperties, + nodes: BTreeMap<Path, NodeSnapshot>, +} +``` + +To get full details on what each field contains, please refer to the [Icechunk library code](https://github.com/earth-mover/icechunk/blob/f460a56577ec560c4debfd89e401a98153cd3560/icechunk/src/format/snapshot.rs#L97). + + +### Attributes Files + +Attribute files hold user-defined attributes separately from the snapshot file. + +!!! warning + Attribute files have not been implemented. + +The on-disk format for attribute files has not been defined yet, but it will probably be a +MessagePack serialization of the attributes map. + +### Chunk Manifest Files + +A chunk manifest file stores chunk references. +Chunk references from multiple arrays can be stored in the same chunk manifest. +The chunks from a single array can also be spread across multiple manifests.
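Before the formal definition, here is a purely illustrative Python sketch of the *logical* content of a single manifest. The class and field names are hypothetical and this is not the on-disk MessagePack layout; the Rust data structure below, taken from the Icechunk library, is the authoritative description. The sketch simply shows how one manifest can mix references belonging to different arrays.

```python
from dataclasses import dataclass
from typing import Union

@dataclass(frozen=True)
class NativeChunkRef:      # points to a chunk file stored inside the repository
    chunk_id: str
    offset: int
    length: int

@dataclass(frozen=True)
class VirtualChunkRef:     # points into a file outside the repository (e.g. a NetCDF file)
    location: str
    offset: int
    length: int

@dataclass(frozen=True)
class InlineChunk:         # very small chunk embedded directly in the manifest
    data: bytes

ChunkPayload = Union[NativeChunkRef, VirtualChunkRef, InlineChunk]

# Logically, a manifest maps (node id, chunk indices) -> chunk payload.
manifest: dict[tuple[str, tuple[int, ...]], ChunkPayload] = {
    ("temperature", (0, 0)): NativeChunkRef("G7E0Z6W9Q4K2T1XJ5RBM", offset=0, length=65536),
    ("temperature", (0, 1)): InlineChunk(b"\x00\x01\x02\x03"),
    ("precipitation", (4,)): VirtualChunkRef("s3://bucket/legacy.nc", offset=1024, length=8192),
}
```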
Manifest files are currently encoded using [MessagePack](https://msgpack.org/), but this may change before Icechunk version 1.0. Given the alpha status of this spec, the best way to understand the information stored +in the manifest file is through the data structure used internally by the Icechunk library. This data structure will almost certainly change before the spec is stabilized: + +```rust +pub struct Manifest { + pub icechunk_manifest_format_version: IcechunkFormatVersion, + pub icechunk_manifest_format_flags: BTreeMap<String, rmpv::Value>, + chunks: BTreeMap<(NodeId, ChunkIndices), ChunkPayload>, +} + +pub enum ChunkPayload { + Inline(Bytes), + Virtual(VirtualChunkRef), + Ref(ChunkRef), +} +``` + +The most important thing to understand from this data structure is that a manifest can hold three types of chunk references: + +* Native (`Ref`), pointing to the id of a chunk within the Icechunk repository. +* Inline (`Inline`), an optimization for very small chunks that can be embedded directly in the manifest. Mostly used for coordinate arrays. +* Virtual (`Virtual`), pointing to a region of a file outside of the Icechunk repository, for example, + a chunk that is inside a NetCDF file in object storage. + +To get full details on what each field contains, please refer to the [Icechunk library code](https://github.com/earth-mover/icechunk/blob/f460a56577ec560c4debfd89e401a98153cd3560/icechunk/src/format/manifest.rs#L106). + +### Chunk Files + +Chunk files contain the compressed binary chunks of a Zarr array. +Icechunk permits quite a bit of flexibility in how chunks are stored. +Chunk files can be: + +- One chunk per chunk file (i.e. standard Zarr) +- Multiple contiguous chunks from the same array in a single chunk file (similar to Zarr V3 shards) +- Chunks from multiple different arrays in the same file +- Other file types (e.g. NetCDF, HDF5) which contain Zarr-compatible chunks + +Applications may choose to arrange chunks within files in different ways to optimize I/O patterns. + +## Algorithms + +### Initialize New Repository + +A new repository is initialized by creating a new (possibly empty) snapshot file and then creating the first file in the main branch sequence. + +If multiple clients attempt to initialize a repository in the same location at the same time, only one can succeed. + +### Read from Repository + +#### From Snapshot ID + +If the specific snapshot ID is known, a client can open it directly in read-only mode. + +1. Use the specified snapshot ID to fetch the snapshot file. +1. Fetch desired attributes and values from arrays. + +#### From Branch + +Usually, a client will want to read the latest snapshot of a branch (e.g. `main`). + +1. List the object store prefix `refs/branch.$BRANCH_NAME/` to obtain the latest branch file in the sequence. Due to the encoding of the sequence number, this should be the _first file_ in lexicographical order. +1. Read the branch file JSON contents to obtain the snapshot ID. +1. Use the snapshot ID to fetch the snapshot file. +1. Fetch desired attributes and values from arrays. + +#### From Tag + +1. Read the tag file found at `refs/tag.$TAG_NAME/ref.json` to obtain the snapshot ID. +1. Use the snapshot ID to fetch the snapshot file. +1. Fetch desired attributes and values from arrays. + +### Write New Snapshot + +1. Open a repository at a specific branch as described above, keeping track of the sequence number and branch name in the session context. +1. [optional] Write new chunk files. +1. [optional] Write new chunk manifests. +1. Write a new snapshot file. +1. Attempt to write the next branch file in the sequence: + 1. If successful, the commit succeeded and the branch is updated. + 1. If unsuccessful, attempt to reconcile and retry the commit. + +### Create New Tag + +A tag can be created from any snapshot. + +1. Open the repository at a specific snapshot. +1.
Attempt to create the tag file. + a. If successful, the tag was created. + b. If unsuccessful, the tag already exists. diff --git a/docs/docs/stylesheets/global.css b/docs/docs/stylesheets/global.css new file mode 100644 index 00000000..6a302f3a --- /dev/null +++ b/docs/docs/stylesheets/global.css @@ -0,0 +1,19 @@ +/* Global Adjustments */ + +/* Adjust spacing between logo and header */ +[dir=ltr] .md-header__title { + margin-left: 0px; +} + +/* +TODO: find a way to show all pages in left sidebar +.md-nav--lifted>.md-nav__list>.md-nav__item, .md-nav--lifted>.md-nav__title { + display:block; +} + */ + + +/* Remove cookie consent overlay */ +.md-consent__overlay { + display:none; +} \ No newline at end of file diff --git a/docs/docs/stylesheets/homepage.css b/docs/docs/stylesheets/homepage.css new file mode 100644 index 00000000..667a2d29 --- /dev/null +++ b/docs/docs/stylesheets/homepage.css @@ -0,0 +1,362 @@ +/* Homepage styles */ + +/* Application header should be static for the landing page */ +.md-header { + position: initial; + } + + /* Remove spacing, as we cannot hide it completely */ + .md-main__inner { + margin: 0; + max-width: 100%; + position: relative; + } + + /* Hide breadcrumb in content */ + #home-content a:first-child { + display:none; + } + + /* Hide table of contents for desktop */ + @media screen and (min-width: 60em) { + .md-sidebar--secondary { + display: none !important; + } + } + + /* Hide navigation for desktop */ + @media screen and (min-width: 76.25em) { + .md-sidebar--primary { + display: none; + } + } + +/* Hero */ +#hero-container { + + --ice-height: 390px; + --header-height: 155px; + + min-height: calc(100vh - var(--header-height)); + position: relative; + text-align:center; + background: linear-gradient(180deg, var(--md-primary-fg-color) 0%, var(--md-primary-fg-color--dark) 100%); +} + +.mdx-hero { + min-height: calc(100vh - var(--header-height) - var(--ice-height)); +} + +h1.hero-title { + font-size: 4rem; + font-weight: bold; + text-transform: uppercase; + margin-bottom: 10px; + color: white; + color: var(--md-primary-bg-color); + text-shadow: 5px 5px 0px rgba(0,0,0,0.2); + +} + +h3.hero-subtitle { + color: var(--md-primary-fg-color--light); + margin-top:0px; + margin-bottom: 60px; + font-size: 1.5rem; + text-transform: lowercase; + text-shadow: 3px 3px 0px rgba(0,0,0,0.2); + +} + + +.hero-cta-button { + background-color: var(--md-primary-bg-color); + box-shadow: 5px 5px 0px rgba(0,0,0,0.1); +} + +.links { + display: flex; + justify-content: center; + align-items: flex-start; + gap: 20px; + margin-top: 60px; +} + +.links img { + box-shadow: 5px 5px 0px rgba(0,0,0,0.1); +} + + +.by-line-wrapper { + margin: 60px auto; + width: 50%; + +} + +.by-line-container { + + border-radius: 20px; + padding: 10px; + color: var(--md-primary-fg-color--light); + font-size: 1.25rem; + justify-content: center; + display: flex; + align-items: center; + gap: 20px; + +} + +.by-line { + display: flex; + align-items: center; + gap: 20px; + text-shadow: 3px 3px 0px rgba(0,0,0,0.2); + +} + +.heart-container { + display:block; + padding-top:15px; +} + +@keyframes heart-beat { + 0% { transform: scale(1); } + 50% { transform: scale(1.5); } + 100% { transform: scale(1); } +} + +.heart-image { + width: 40px; + min-width: 40px; + height: auto; + + animation-name: heart-beat; + animation-timeline: scroll(root); + animation-timing-function: ease-in; + animation-fill-mode: forwards; + animation-range: entry 10% cover 50%; +} + +.earthmover-wordmark { + width: 260px; +} + + +/* ice */ 
+.ice-container { + height: var(--ice-height); + width: 100%; + margin: 0 auto; + max-width: 1745px; + overflow: hidden; + position:relative; + + -ms-overflow-style: none; /* IE and Edge */ + scrollbar-width: none; /* Firefox */ +} + +.ice-container::-webkit-scrollbar { + display:none; +} + +.ice-image { + width: 1745px; + position:relative; +} + +.ice-shore { + height: var(--ice-height); + width: 1745px; + margin:0 auto; + background-image: url('../assets/hero/hero-bottom.svg'); + background-repeat: no-repeat; + background-position: bottom center; + z-index:1; + position:relative; +} + +.ice-cube { + height: 182px; + width: 140px; + background-image: url('../assets/hero/ice-1.svg'); + background-repeat: no-repeat; + background-position: top left; + position: absolute; + + z-index:0; + + animation-name: ice-parallax; + animation-timeline: scroll(root); + animation-timing-function: ease-in; + animation-fill-mode: forwards; +} + +@keyframes ice-parallax-1 { + from { transform: translateY(0px); } + to { transform: translateY(-220px); } +} + +#ice-1 { + animation-name: ice-parallax-1; + animation-range: entry 20% cover 60%; + + bottom:-29px; + right:421px; +} + +@keyframes ice-parallax-2 { + from { transform: translateY(0px) scale(1.5); } + to { transform: translateY(-200px) scale(1.5); } +} + +#ice-2 { + transform: scale(1.5); + animation-name: ice-parallax-2; + animation-range: entry 30% cover 70%; + + + bottom:-45px; + left:350px; +} + +@keyframes ice-parallax-3 { + from { transform: translateY(0px) scale(0.8); } + to { transform: translateY(-260px) scale(0.8); } +} + +#ice-3 { + transform: scale(0.8); + animation-name: ice-parallax-3; + animation-range: entry 0% cover 40%; + + bottom:-36px; + left:640px; +} + +/* Dark theme */ +[data-md-color-scheme="slate"]{ + .ice-shore { + background-image: url('../assets/hero/hero-bottom-dark.svg'); + } + .ice-cube { + background-image: url('../assets/hero/ice-1-dark.svg'); + + } +} + + +/* Mobile */ +@media screen and (max-width: 60em) { + + .ice-image { + transform:translateX(-56%); + } + + #hero-image { + min-width: 200px; + } + + h1.hero-title{ + font-size:2rem; + } + + h3.hero-subtitle { + font-size: 1rem; + } + + + .links { + flex-direction: column; + align-items: center; + } + + .by-line-wrapper { + width:70%; + } + + .by-line-container { + flex-direction: column; + } + .by-line span { + font-size: 0.85rem; + } + + .earthmover-wordmark { + width: 200px; + } +} + + +/* Logo */ + +@keyframes bob { + from { + transform: translate3d(0,0,0); + } + to { + transform: translate3d(0, -2%, 0); + } +} + +@keyframes ripple { + 0% { + transform: scale(1); + opacity: 0; + } + 25%{ + opacity: 0.25; + } + 100% { + transform: scale(1.5); + opacity: 0; + } +} + +#cube-logo { + width: 30vh; + max-width: 600px; + height: auto; + margin-top: 5%; +} + +#cube-container { + animation: bob 2s ease-in-out alternate infinite; +} + +#ripple-1-front, #ripple-1-back { + opacity: 0; + transform-box: fill-box; + animation: ripple 2s ease-in-out infinite; +} + +#ripple-2-front, #ripple-2-back { + opacity: 0; + transform-box: fill-box; + animation: ripple 2s 1s ease-in-out infinite; +} + +#ripple-1-front, #ripple-2-front { + transform-origin: center bottom; +} + +#ripple-1-back, #ripple-2-back { + transform-origin: center top; +} + +.no-pointer { + pointer-events: none; +} + + +.small-cube path { + transition: filter 4s ease-in-out; + transform-box: fill-box; + transform-origin: center center; +} + + +.small-cube:hover path { + filter: hue-rotate(620deg) saturate(2000%) 
invert(70%); + transition: all 0s ease-in-out; + +} diff --git a/docs/docs/stylesheets/notebooks.css b/docs/docs/stylesheets/notebooks.css new file mode 100644 index 00000000..583875ef --- /dev/null +++ b/docs/docs/stylesheets/notebooks.css @@ -0,0 +1,9 @@ +/* Notebook Adjustments */ +/* Hides prompts ([In]/[Out]) */ +.jp-InputPrompt { + display: none !important; +} + +.jp-OutputPrompt { + display: none !important; +} \ No newline at end of file diff --git a/docs/docs/stylesheets/theme.css b/docs/docs/stylesheets/theme.css new file mode 100644 index 00000000..4f1a9bd9 --- /dev/null +++ b/docs/docs/stylesheets/theme.css @@ -0,0 +1,33 @@ +/* Colors + @see https://m2.material.io/design/color/the-color-system.html#tools-for-picking-colors + @see https://github.com/squidfunk/mkdocs-material/blob/master/src/templates/assets/stylesheets/main/_colors.scss +*/ + +[data-md-color-scheme="light"] { + /* Primary color shades */ + --md-primary-fg-color: #5ea0d1; + --md-primary-fg-color--light: #e4f1f8; + --md-primary-fg-color--dark: #1d467f; + + --md-primary-bg-color: hsla(0, 0%, 100%, 1); + --md-primary-bg-color--light: hsla(0, 0%, 100%, 0.7); + + /* Accent color shades */ + --md-accent-fg-color: #a653ff; + --md-accent-fg-color--transparent: rgba(166, 83, 255, 0.7); + //--md-accent-bg-color: hsla(0, 0%, 100%, 1); + //--md-accent-bg-color--light: hsla(0, 0%, 100%, 0.7); +} + +/* Dark */ +[data-md-color-scheme="slate"] { + --md-primary-fg-color: #1d467f; + --md-primary-fg-color--light: #e4f1f8; + --md-primary-fg-color--dark: #5ea0d1; + + /* Accent color shades */ + --md-accent-fg-color: #a653ff; + --md-accent-fg-color--transparent: rgba(166, 83, 255, 0.7); + //--md-accent-bg-color: hsla(0, 0%, 100%, 1); + //--md-accent-bg-color--light: hsla(0, 0%, 100%, 0.7); +} diff --git a/docs/macros.py b/docs/macros.py new file mode 100644 index 00000000..925cc213 --- /dev/null +++ b/docs/macros.py @@ -0,0 +1,48 @@ +import os +from pathlib import Path +import logging + +def define_env(env): + # TODO: is there a better way of including these files and dirs? Symlinking seems error prone... + # Potentially use: https://github.com/backstage/mkdocs-monorepo-plugin + def symlink_external_dirs(): + """ + Creates symbolic links from external directories to the docs_dir. 
+ """ + try: + # Resolve paths for docs and monorepo root + docs_dir = Path('./docs').resolve() + monorepo_root = docs_dir.parent.parent + + # Symlinked paths + external_sources = { + monorepo_root / 'icechunk-python' / 'notebooks' : docs_dir / 'icechunk-python' / 'notebooks', + monorepo_root / 'icechunk-python' / 'examples' : docs_dir / 'icechunk-python' / 'examples', + } + + for src, target in external_sources.items(): + if not src.exists(): + logging.error(f"Source directory does not exist: {src}") + raise FileNotFoundError(f"Source directory does not exist: {src}") + + # Ensure parent directory exists + target.parent.mkdir(parents=True, exist_ok=True) + + # Remove existing symlink or directory if it exists + if target.is_symlink() or target.exists(): + if target.is_dir() and not target.is_symlink(): + logging.error(f"Directory {target} already exists and is not a symlink.") + raise Exception(f"Directory {target} already exists and is not a symlink.") + target.unlink() + logging.info(f"Removed existing symlink or directory at: {target}") + + # Create symbolic link + os.symlink(src, target) + logging.info(f"Created symlink: {target} -> {src}") + + except Exception as e: + logging.error(f"Error creating symlinks: {e}") + raise e + + # Execute the symlink creation + symlink_external_dirs() \ No newline at end of file diff --git a/docs/mkdocs.yml b/docs/mkdocs.yml new file mode 100644 index 00000000..42a7637a --- /dev/null +++ b/docs/mkdocs.yml @@ -0,0 +1,190 @@ +site_name: Icechunk +site_description: >- + Open-source, cloud-native transactional tensor storage engine +site_author: Earthmover PBC +site_url: https://icechunk.io +repo_url: https://github.com/earth-mover/icechunk +repo_name: earth-mover/icechunk +copyright: Earthmover PBC # @see overrides/partials/footer.html + +site_dir: ./.site + +extra_css: + - stylesheets/theme.css + - stylesheets/global.css + - stylesheets/notebooks.css + +theme: + name: material + custom_dir: overrides + logo: assets/logo-wire.svg + favicon: assets/favicon/favicon-96x96.png + palette: + + # Palette toggle for automatic mode + #- media: "(prefers-color-scheme)" + # toggle: + # icon: material/brightness-auto + # name: Switch to light mode + + # Light Mode + - media: "(prefers-color-scheme: light)" + scheme: light + toggle: + icon: material/weather-night + name: Switch to dark mode + + # Dark Mode + - media: "(prefers-color-scheme: dark)" + scheme: slate + toggle: + icon: material/weather-sunny + name: Switch to light mode + + features: + - navigation.instant + - navigation.instant.prefetch + - navigation.instant.progress + - navigation.tracking + - navigation.indexes + - navigation.footer + - navigation.tabs + - navigation.tabs.sticky + #- navigation.expand + - toc.follow + - navigation.top + - announce.dismiss + - content.code.copy + - content.code.annotate + icon: + repo: fontawesome/brands/github + font: + text: Roboto + code: Roboto Mono + +extra: + social: + - icon: fontawesome/brands/github + link: https://github.com/earth-mover/icechunk + - icon: fontawesome/brands/python + link: https://pypi.org/project/icechunk/ + - icon: fontawesome/brands/rust + link: https://crates.io/crates/icechunk + - icon: fontawesome/brands/slack + link: https://join.slack.com/t/earthmover-community/shared_invite/zt-2cwje92ir-xU3CfdG8BI~4CJOJy~sceQ + - icon: fontawesome/brands/x-twitter + link: https://x.com/earthmoverhq + generator: false + status: + new: Recently Added + deprecated: Deprecated + analytics: + provider: google + property: G-TNHH1RF342 + feedback: + title: 
Was this page helpful? + ratings: + - icon: material/emoticon-happy-outline + name: This page was helpful + data: 1 + note: >- + Thanks for your feedback! + - icon: material/emoticon-sad-outline + name: This page could be improved + data: 0 + note: >- + Thanks for your feedback! Help us improve this page by + using our feedback form. + consent: + title: Cookie consent + description: >- + We use cookies to recognize your repeated visits and preferences, as well + as to measure the effectiveness of our documentation and whether users + find what they're searching for. With your consent, you're helping us to + make our documentation better. + +plugins: + #- mike # TODO: https://squidfunk.github.io/mkdocs-material/setup/setting-up-versioning/ + #- optimize #TODO: https://squidfunk.github.io/mkdocs-material/plugins/optimize/ + - search + - social + - include-markdown + - open-in-new-tab + - mkdocs-breadcrumbs-plugin: + exclude_paths: + #- icechunk-python + - assets + - stylesheets + - index.md + generate_home_index: false + - mermaid2 + - minify: + minify_html: true + - awesome-pages: + collapse_single_pages: true + - macros: + module_name: macros + - git-revision-date-localized: + #enabled: !ENV [CI, false] + - git-authors + - git-committers: + repository: earth-mover/icechunk + branch: main + #enabled: !ENV [CI, false] + exclude: + - index.md + - mkdocstrings: + default_handler: python + handlers: + python: + paths: [../icechunk-python/python] + + - mkdocs-jupyter: + include_source: True + #include: + # - "icechunk-python/notebooks/*.ipynb" + # - "icechunk-python/examples/*.py" + +markdown_extensions: + - admonition + - tables + - pymdownx.inlinehilite + - pymdownx.snippets + - pymdownx.superfences: + # make exceptions to highlighting of code: + custom_fences: + - name: mermaid + class: mermaid + format: !!python/name:mermaid2.fence_mermaid_custom + - pymdownx.tabbed: + alternate_style: true + - pymdownx.highlight: + anchor_linenums: true + line_spans: __span + pygments_lang_class: true + - attr_list + - md_in_html + - pymdownx.emoji: + emoji_index: !!python/name:material.extensions.emoji.twemoji + emoji_generator: !!python/name:material.extensions.emoji.to_svg + +nav: + - Home: index.md + - Overview: overview.md + - FAQ: faq.md + - Icechunk Python: + - icechunk-python/quickstart.md + - icechunk-python/configuration.md + - icechunk-python/xarray.md + - icechunk-python/version-control.md + - Virtual Datasets: icechunk-python/virtual.md + - API Reference: icechunk-python/reference.md +# - Examples: +# - ... | flat | icechunk-python/examples/*.py +# - Notebooks: +# - ... | flat | icechunk-python/notebooks/*.ipynb + - Icechunk Rust: icechunk-rust.md + - contributing.md + - Sample Datasets: sample-datasets.md + - Spec: spec.md + diff --git a/docs/overrides/home.html b/docs/overrides/home.html new file mode 100644 index 00000000..d6b2e83c --- /dev/null +++ b/docs/overrides/home.html @@ -0,0 +1,74 @@ +{% extends "main.html" %} + + +{% block announce %} + +{% endblock %} + + +{% block tabs %} + {{ super() }} + + + + + +
+
+ + +
+ {% include "partials/cube.html" %} +
+ + +
+

Icechunk

+

{{ config.site_description }}

+ + Get Started + + + + + +
+
+
+
+
+
+
+
+
+
+
+{% endblock %} + + +{% block content %} +
+
+ {{ page.content }} +
+
+{% endblock %} + diff --git a/docs/overrides/main.html b/docs/overrides/main.html new file mode 100644 index 00000000..63913c18 --- /dev/null +++ b/docs/overrides/main.html @@ -0,0 +1 @@ +{% extends "base.html" %} \ No newline at end of file diff --git a/docs/overrides/partials/copyright.html b/docs/overrides/partials/copyright.html new file mode 100644 index 00000000..f55e0083 --- /dev/null +++ b/docs/overrides/partials/copyright.html @@ -0,0 +1,14 @@ + \ No newline at end of file diff --git a/docs/overrides/partials/cube.html b/docs/overrides/partials/cube.html new file mode 100644 index 00000000..b2756cc9 --- /dev/null +++ b/docs/overrides/partials/cube.html @@ -0,0 +1,1278 @@ + diff --git a/docs/overrides/partials/nav.html b/docs/overrides/partials/nav.html new file mode 100644 index 00000000..e3162d78 --- /dev/null +++ b/docs/overrides/partials/nav.html @@ -0,0 +1,51 @@ + + +{% import "partials/nav-item.html" as item with context %} + + +{% set class = "md-nav md-nav--primary" %} + +{% if "toc.integrate" in features %} + {% set class = class ~ " md-nav--integrated" %} +{% endif %} + + + \ No newline at end of file diff --git a/docs/poetry.lock b/docs/poetry.lock new file mode 100644 index 00000000..9daa2327 --- /dev/null +++ b/docs/poetry.lock @@ -0,0 +1,2739 @@ +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. + +[[package]] +name = "appnope" +version = "0.1.4" +description = "Disable App Nap on macOS >= 10.9" +optional = false +python-versions = ">=3.6" +files = [ + {file = "appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c"}, + {file = "appnope-0.1.4.tar.gz", hash = "sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee"}, +] + +[[package]] +name = "asttokens" +version = "2.4.1" +description = "Annotate AST trees with source code positions" +optional = false +python-versions = "*" +files = [ + {file = "asttokens-2.4.1-py2.py3-none-any.whl", hash = "sha256:051ed49c3dcae8913ea7cd08e46a606dba30b79993209636c4875bc1d637bc24"}, + {file = "asttokens-2.4.1.tar.gz", hash = "sha256:b03869718ba9a6eb027e134bfdf69f38a236d681c83c160d510768af11254ba0"}, +] + +[package.dependencies] +six = ">=1.12.0" + +[package.extras] +astroid = ["astroid (>=1,<2)", "astroid (>=2,<4)"] +test = ["astroid (>=1,<2)", "astroid (>=2,<4)", "pytest"] + +[[package]] +name = "attrs" +version = "24.2.0" +description = "Classes Without Boilerplate" +optional = false +python-versions = ">=3.7" +files = [ + {file = "attrs-24.2.0-py3-none-any.whl", hash = "sha256:81921eb96de3191c8258c199618104dd27ac608d9366f5e35d011eae1867ede2"}, + {file = "attrs-24.2.0.tar.gz", hash = "sha256:5cfb1b9148b5b086569baec03f20d7b6bf3bcacc9a42bebf87ffaaca362f6346"}, +] + +[package.extras] +benchmark = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-codspeed", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +cov = ["cloudpickle", "coverage[toml] (>=5.3)", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +dev = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pre-commit", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +docs = ["cogapp", "furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-towncrier", "towncrier (<24.7)"] +tests = ["cloudpickle", "hypothesis", "mypy (>=1.11.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +tests-mypy = ["mypy 
(>=1.11.1)", "pytest-mypy-plugins"] + +[[package]] +name = "babel" +version = "2.16.0" +description = "Internationalization utilities" +optional = false +python-versions = ">=3.8" +files = [ + {file = "babel-2.16.0-py3-none-any.whl", hash = "sha256:368b5b98b37c06b7daf6696391c3240c938b37767d4584413e8438c5c435fa8b"}, + {file = "babel-2.16.0.tar.gz", hash = "sha256:d1f3554ca26605fe173f3de0c65f750f5a42f924499bf134de6423582298e316"}, +] + +[package.extras] +dev = ["freezegun (>=1.0,<2.0)", "pytest (>=6.0)", "pytest-cov"] + +[[package]] +name = "beautifulsoup4" +version = "4.12.3" +description = "Screen-scraping library" +optional = false +python-versions = ">=3.6.0" +files = [ + {file = "beautifulsoup4-4.12.3-py3-none-any.whl", hash = "sha256:b80878c9f40111313e55da8ba20bdba06d8fa3969fc68304167741bbf9e082ed"}, + {file = "beautifulsoup4-4.12.3.tar.gz", hash = "sha256:74e3d1928edc070d21748185c46e3fb33490f22f52a3addee9aee0f4f7781051"}, +] + +[package.dependencies] +soupsieve = ">1.2" + +[package.extras] +cchardet = ["cchardet"] +chardet = ["chardet"] +charset-normalizer = ["charset-normalizer"] +html5lib = ["html5lib"] +lxml = ["lxml"] + +[[package]] +name = "bleach" +version = "6.1.0" +description = "An easy safelist-based HTML-sanitizing tool." +optional = false +python-versions = ">=3.8" +files = [ + {file = "bleach-6.1.0-py3-none-any.whl", hash = "sha256:3225f354cfc436b9789c66c4ee030194bee0568fbf9cbdad3bc8b5c26c5f12b6"}, + {file = "bleach-6.1.0.tar.gz", hash = "sha256:0a31f1837963c41d46bbf1331b8778e1308ea0791db03cc4e7357b97cf42a8fe"}, +] + +[package.dependencies] +six = ">=1.9.0" +webencodings = "*" + +[package.extras] +css = ["tinycss2 (>=1.1.0,<1.3)"] + +[[package]] +name = "bracex" +version = "2.5.post1" +description = "Bash style brace expander." +optional = false +python-versions = ">=3.8" +files = [ + {file = "bracex-2.5.post1-py3-none-any.whl", hash = "sha256:13e5732fec27828d6af308628285ad358047cec36801598368cb28bc631dbaf6"}, + {file = "bracex-2.5.post1.tar.gz", hash = "sha256:12c50952415bfa773d2d9ccb8e79651b8cdb1f31a42f6091b804f6ba2b4a66b6"}, +] + +[[package]] +name = "cairocffi" +version = "1.7.1" +description = "cffi-based cairo bindings for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "cairocffi-1.7.1-py3-none-any.whl", hash = "sha256:9803a0e11f6c962f3b0ae2ec8ba6ae45e957a146a004697a1ac1bbf16b073b3f"}, + {file = "cairocffi-1.7.1.tar.gz", hash = "sha256:2e48ee864884ec4a3a34bfa8c9ab9999f688286eb714a15a43ec9d068c36557b"}, +] + +[package.dependencies] +cffi = ">=1.1.0" + +[package.extras] +doc = ["sphinx", "sphinx_rtd_theme"] +test = ["numpy", "pikepdf", "pytest", "ruff"] +xcb = ["xcffib (>=1.4.0)"] + +[[package]] +name = "cairosvg" +version = "2.7.1" +description = "A Simple SVG Converter based on Cairo" +optional = false +python-versions = ">=3.5" +files = [ + {file = "CairoSVG-2.7.1-py3-none-any.whl", hash = "sha256:8a5222d4e6c3f86f1f7046b63246877a63b49923a1cd202184c3a634ef546b3b"}, + {file = "CairoSVG-2.7.1.tar.gz", hash = "sha256:432531d72347291b9a9ebfb6777026b607563fd8719c46ee742db0aef7271ba0"}, +] + +[package.dependencies] +cairocffi = "*" +cssselect2 = "*" +defusedxml = "*" +pillow = "*" +tinycss2 = "*" + +[package.extras] +doc = ["sphinx", "sphinx-rtd-theme"] +test = ["flake8", "isort", "pytest"] + +[[package]] +name = "certifi" +version = "2024.8.30" +description = "Python package for providing Mozilla's CA Bundle." 
+optional = false +python-versions = ">=3.6" +files = [ + {file = "certifi-2024.8.30-py3-none-any.whl", hash = "sha256:922820b53db7a7257ffbda3f597266d435245903d80737e34f8a45ff3e3230d8"}, + {file = "certifi-2024.8.30.tar.gz", hash = "sha256:bec941d2aa8195e248a60b31ff9f0558284cf01a52591ceda73ea9afffd69fd9"}, +] + +[[package]] +name = "cffi" +version = "1.17.1" +description = "Foreign Function Interface for Python calling C code." +optional = false +python-versions = ">=3.8" +files = [ + {file = "cffi-1.17.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:df8b1c11f177bc2313ec4b2d46baec87a5f3e71fc8b45dab2ee7cae86d9aba14"}, + {file = "cffi-1.17.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f2cdc858323644ab277e9bb925ad72ae0e67f69e804f4898c070998d50b1a67"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:edae79245293e15384b51f88b00613ba9f7198016a5948b5dddf4917d4d26382"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45398b671ac6d70e67da8e4224a065cec6a93541bb7aebe1b198a61b58c7b702"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ad9413ccdeda48c5afdae7e4fa2192157e991ff761e7ab8fdd8926f40b160cc3"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5da5719280082ac6bd9aa7becb3938dc9f9cbd57fac7d2871717b1feb0902ab6"}, + {file = "cffi-1.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2bb1a08b8008b281856e5971307cc386a8e9c5b625ac297e853d36da6efe9c17"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:045d61c734659cc045141be4bae381a41d89b741f795af1dd018bfb532fd0df8"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:6883e737d7d9e4899a8a695e00ec36bd4e5e4f18fabe0aca0efe0a4b44cdb13e"}, + {file = "cffi-1.17.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:6b8b4a92e1c65048ff98cfe1f735ef8f1ceb72e3d5f0c25fdb12087a23da22be"}, + {file = "cffi-1.17.1-cp310-cp310-win32.whl", hash = "sha256:c9c3d058ebabb74db66e431095118094d06abf53284d9c81f27300d0e0d8bc7c"}, + {file = "cffi-1.17.1-cp310-cp310-win_amd64.whl", hash = "sha256:0f048dcf80db46f0098ccac01132761580d28e28bc0f78ae0d58048063317e15"}, + {file = "cffi-1.17.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a45e3c6913c5b87b3ff120dcdc03f6131fa0065027d0ed7ee6190736a74cd401"}, + {file = "cffi-1.17.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:30c5e0cb5ae493c04c8b42916e52ca38079f1b235c2f8ae5f4527b963c401caf"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f75c7ab1f9e4aca5414ed4d8e5c0e303a34f4421f8a0d47a4d019ceff0ab6af4"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1ed2dd2972641495a3ec98445e09766f077aee98a1c896dcb4ad0d303628e41"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:46bf43160c1a35f7ec506d254e5c890f3c03648a4dbac12d624e4490a7046cd1"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a24ed04c8ffd54b0729c07cee15a81d964e6fee0e3d4d342a27b020d22959dc6"}, + {file = "cffi-1.17.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:610faea79c43e44c71e1ec53a554553fa22321b65fae24889706c0a84d4ad86d"}, + {file = 
"cffi-1.17.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:a9b15d491f3ad5d692e11f6b71f7857e7835eb677955c00cc0aefcd0669adaf6"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:de2ea4b5833625383e464549fec1bc395c1bdeeb5f25c4a3a82b5a8c756ec22f"}, + {file = "cffi-1.17.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:fc48c783f9c87e60831201f2cce7f3b2e4846bf4d8728eabe54d60700b318a0b"}, + {file = "cffi-1.17.1-cp311-cp311-win32.whl", hash = "sha256:85a950a4ac9c359340d5963966e3e0a94a676bd6245a4b55bc43949eee26a655"}, + {file = "cffi-1.17.1-cp311-cp311-win_amd64.whl", hash = "sha256:caaf0640ef5f5517f49bc275eca1406b0ffa6aa184892812030f04c2abf589a0"}, + {file = "cffi-1.17.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:805b4371bf7197c329fcb3ead37e710d1bca9da5d583f5073b799d5c5bd1eee4"}, + {file = "cffi-1.17.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:733e99bc2df47476e3848417c5a4540522f234dfd4ef3ab7fafdf555b082ec0c"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1257bdabf294dceb59f5e70c64a3e2f462c30c7ad68092d01bbbfb1c16b1ba36"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:da95af8214998d77a98cc14e3a3bd00aa191526343078b530ceb0bd710fb48a5"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d63afe322132c194cf832bfec0dc69a99fb9bb6bbd550f161a49e9e855cc78ff"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f79fc4fc25f1c8698ff97788206bb3c2598949bfe0fef03d299eb1b5356ada99"}, + {file = "cffi-1.17.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b62ce867176a75d03a665bad002af8e6d54644fad99a3c70905c543130e39d93"}, + {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:386c8bf53c502fff58903061338ce4f4950cbdcb23e2902d86c0f722b786bbe3"}, + {file = "cffi-1.17.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:4ceb10419a9adf4460ea14cfd6bc43d08701f0835e979bf821052f1805850fe8"}, + {file = "cffi-1.17.1-cp312-cp312-win32.whl", hash = "sha256:a08d7e755f8ed21095a310a693525137cfe756ce62d066e53f502a83dc550f65"}, + {file = "cffi-1.17.1-cp312-cp312-win_amd64.whl", hash = "sha256:51392eae71afec0d0c8fb1a53b204dbb3bcabcb3c9b807eedf3e1e6ccf2de903"}, + {file = "cffi-1.17.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:f3a2b4222ce6b60e2e8b337bb9596923045681d71e5a082783484d845390938e"}, + {file = "cffi-1.17.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:0984a4925a435b1da406122d4d7968dd861c1385afe3b45ba82b750f229811e2"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d01b12eeeb4427d3110de311e1774046ad344f5b1a7403101878976ecd7a10f3"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:706510fe141c86a69c8ddc029c7910003a17353970cff3b904ff0686a5927683"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de55b766c7aa2e2a3092c51e0483d700341182f08e67c63630d5b6f200bb28e5"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c59d6e989d07460165cc5ad3c61f9fd8f1b4796eacbd81cee78957842b834af4"}, + {file = "cffi-1.17.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:dd398dbc6773384a17fe0d3e7eeb8d1a21c2200473ee6806bb5e6a8e62bb73dd"}, + {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3edc8d958eb099c634dace3c7e16560ae474aa3803a5df240542b305d14e14ed"}, + {file = "cffi-1.17.1-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:72e72408cad3d5419375fc87d289076ee319835bdfa2caad331e377589aebba9"}, + {file = "cffi-1.17.1-cp313-cp313-win32.whl", hash = "sha256:e03eab0a8677fa80d646b5ddece1cbeaf556c313dcfac435ba11f107ba117b5d"}, + {file = "cffi-1.17.1-cp313-cp313-win_amd64.whl", hash = "sha256:f6a16c31041f09ead72d69f583767292f750d24913dadacf5756b966aacb3f1a"}, + {file = "cffi-1.17.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:636062ea65bd0195bc012fea9321aca499c0504409f413dc88af450b57ffd03b"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c7eac2ef9b63c79431bc4b25f1cd649d7f061a28808cbc6c47b534bd789ef964"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e221cf152cff04059d011ee126477f0d9588303eb57e88923578ace7baad17f9"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:31000ec67d4221a71bd3f67df918b1f88f676f1c3b535a7eb473255fdc0b83fc"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6f17be4345073b0a7b8ea599688f692ac3ef23ce28e5df79c04de519dbc4912c"}, + {file = "cffi-1.17.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0e2b1fac190ae3ebfe37b979cc1ce69c81f4e4fe5746bb401dca63a9062cdaf1"}, + {file = "cffi-1.17.1-cp38-cp38-win32.whl", hash = "sha256:7596d6620d3fa590f677e9ee430df2958d2d6d6de2feeae5b20e82c00b76fbf8"}, + {file = "cffi-1.17.1-cp38-cp38-win_amd64.whl", hash = "sha256:78122be759c3f8a014ce010908ae03364d00a1f81ab5c7f4a7a5120607ea56e1"}, + {file = "cffi-1.17.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b2ab587605f4ba0bf81dc0cb08a41bd1c0a5906bd59243d56bad7668a6fc6c16"}, + {file = "cffi-1.17.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:28b16024becceed8c6dfbc75629e27788d8a3f9030691a1dbf9821a128b22c36"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1d599671f396c4723d016dbddb72fe8e0397082b0a77a4fab8028923bec050e8"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ca74b8dbe6e8e8263c0ffd60277de77dcee6c837a3d0881d8c1ead7268c9e576"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7f5baafcc48261359e14bcd6d9bff6d4b28d9103847c9e136694cb0501aef87"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98e3969bcff97cae1b2def8ba499ea3d6f31ddfdb7635374834cf89a1a08ecf0"}, + {file = "cffi-1.17.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdf5ce3acdfd1661132f2a9c19cac174758dc2352bfe37d98aa7512c6b7178b3"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:9755e4345d1ec879e3849e62222a18c7174d65a6a92d5b346b1863912168b595"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:f1e22e8c4419538cb197e4dd60acc919d7696e5ef98ee4da4e01d3f8cfa4cc5a"}, + {file = "cffi-1.17.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c03e868a0b3bc35839ba98e74211ed2b05d2119be4e8a0f224fba9384f1fe02e"}, + {file = "cffi-1.17.1-cp39-cp39-win32.whl", hash = 
"sha256:e31ae45bc2e29f6b2abd0de1cc3b9d5205aa847cafaecb8af1476a609a2f6eb7"}, + {file = "cffi-1.17.1-cp39-cp39-win_amd64.whl", hash = "sha256:d016c76bdd850f3c626af19b0542c9677ba156e4ee4fccfdd7848803533ef662"}, + {file = "cffi-1.17.1.tar.gz", hash = "sha256:1c39c6016c32bc48dd54561950ebd6836e1670f2ae46128f67cf49e789c52824"}, +] + +[package.dependencies] +pycparser = "*" + +[[package]] +name = "charset-normalizer" +version = "3.4.0" +description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:4f9fc98dad6c2eaa32fc3af1417d95b5e3d08aff968df0cd320066def971f9a6"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0de7b687289d3c1b3e8660d0741874abe7888100efe14bd0f9fd7141bcbda92b"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5ed2e36c3e9b4f21dd9422f6893dec0abf2cca553af509b10cd630f878d3eb99"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40d3ff7fc90b98c637bda91c89d51264a3dcf210cade3a2c6f838c7268d7a4ca"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1110e22af8ca26b90bd6364fe4c763329b0ebf1ee213ba32b68c73de5752323d"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:86f4e8cca779080f66ff4f191a685ced73d2f72d50216f7112185dc02b90b9b7"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f683ddc7eedd742e2889d2bfb96d69573fde1d92fcb811979cdb7165bb9c7d3"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:27623ba66c183eca01bf9ff833875b459cad267aeeb044477fedac35e19ba907"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:f606a1881d2663630ea5b8ce2efe2111740df4b687bd78b34a8131baa007f79b"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:0b309d1747110feb25d7ed6b01afdec269c647d382c857ef4663bbe6ad95a912"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:136815f06a3ae311fae551c3df1f998a1ebd01ddd424aa5603a4336997629e95"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:14215b71a762336254351b00ec720a8e85cada43b987da5a042e4ce3e82bd68e"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:79983512b108e4a164b9c8d34de3992f76d48cadc9554c9e60b43f308988aabe"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-win32.whl", hash = "sha256:c94057af19bc953643a33581844649a7fdab902624d2eb739738a30e2b3e60fc"}, + {file = "charset_normalizer-3.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:55f56e2ebd4e3bc50442fbc0888c9d8c94e4e06a933804e2af3e89e2f9c1c749"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0d99dd8ff461990f12d6e42c7347fd9ab2532fb70e9621ba520f9e8637161d7c"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:c57516e58fd17d03ebe67e181a4e4e2ccab1168f8c2976c6a334d4f819fe5944"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:6dba5d19c4dfab08e58d5b36304b3f92f3bd5d42c1a3fa37b5ba5cdf6dfcbcee"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bf4475b82be41b07cc5e5ff94810e6a01f276e37c2d55571e3fe175e467a1a1c"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ce031db0408e487fd2775d745ce30a7cd2923667cf3b69d48d219f1d8f5ddeb6"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8ff4e7cdfdb1ab5698e675ca622e72d58a6fa2a8aa58195de0c0061288e6e3ea"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3710a9751938947e6327ea9f3ea6332a09bf0ba0c09cae9cb1f250bd1f1549bc"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:82357d85de703176b5587dbe6ade8ff67f9f69a41c0733cf2425378b49954de5"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:47334db71978b23ebcf3c0f9f5ee98b8d65992b65c9c4f2d34c2eaf5bcaf0594"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:8ce7fd6767a1cc5a92a639b391891bf1c268b03ec7e021c7d6d902285259685c"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:f1a2f519ae173b5b6a2c9d5fa3116ce16e48b3462c8b96dfdded11055e3d6365"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:63bc5c4ae26e4bc6be6469943b8253c0fd4e4186c43ad46e713ea61a0ba49129"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:bcb4f8ea87d03bc51ad04add8ceaf9b0f085ac045ab4d74e73bbc2dc033f0236"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-win32.whl", hash = "sha256:9ae4ef0b3f6b41bad6366fb0ea4fc1d7ed051528e113a60fa2a65a9abb5b1d99"}, + {file = "charset_normalizer-3.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cee4373f4d3ad28f1ab6290684d8e2ebdb9e7a1b74fdc39e4c211995f77bec27"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0713f3adb9d03d49d365b70b84775d0a0d18e4ab08d12bc46baa6132ba78aaf6"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:de7376c29d95d6719048c194a9cf1a1b0393fbe8488a22008610b0361d834ecf"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a51b48f42d9358460b78725283f04bddaf44a9358197b889657deba38f329db"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b295729485b06c1a0683af02a9e42d2caa9db04a373dc38a6a58cdd1e8abddf1"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee803480535c44e7f5ad00788526da7d85525cfefaf8acf8ab9a310000be4b03"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3d59d125ffbd6d552765510e3f31ed75ebac2c7470c7274195b9161a32350284"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8cda06946eac330cbe6598f77bb54e690b4ca93f593dee1568ad22b04f347c15"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:07afec21bbbbf8a5cc3651aa96b980afe2526e7f048fdfb7f1014d84acc8b6d8"}, + {file = 
"charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6b40e8d38afe634559e398cc32b1472f376a4099c75fe6299ae607e404c033b2"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b8dcd239c743aa2f9c22ce674a145e0a25cb1566c495928440a181ca1ccf6719"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:84450ba661fb96e9fd67629b93d2941c871ca86fc38d835d19d4225ff946a631"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:44aeb140295a2f0659e113b31cfe92c9061622cadbc9e2a2f7b8ef6b1e29ef4b"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:1db4e7fefefd0f548d73e2e2e041f9df5c59e178b4c72fbac4cc6f535cfb1565"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-win32.whl", hash = "sha256:5726cf76c982532c1863fb64d8c6dd0e4c90b6ece9feb06c9f202417a31f7dd7"}, + {file = "charset_normalizer-3.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:b197e7094f232959f8f20541ead1d9862ac5ebea1d58e9849c1bf979255dfac9"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:dd4eda173a9fcccb5f2e2bd2a9f423d180194b1bf17cf59e3269899235b2a114"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e9e3c4c9e1ed40ea53acf11e2a386383c3304212c965773704e4603d589343ed"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:92a7e36b000bf022ef3dbb9c46bfe2d52c047d5e3f3343f43204263c5addc250"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:54b6a92d009cbe2fb11054ba694bc9e284dad30a26757b1e372a1fdddaf21920"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1ffd9493de4c922f2a38c2bf62b831dcec90ac673ed1ca182fe11b4d8e9f2a64"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:35c404d74c2926d0287fbd63ed5d27eb911eb9e4a3bb2c6d294f3cfd4a9e0c23"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4796efc4faf6b53a18e3d46343535caed491776a22af773f366534056c4e1fbc"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e7fdd52961feb4c96507aa649550ec2a0d527c086d284749b2f582f2d40a2e0d"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:92db3c28b5b2a273346bebb24857fda45601aef6ae1c011c0a997106581e8a88"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ab973df98fc99ab39080bfb0eb3a925181454d7c3ac8a1e695fddfae696d9e90"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:4b67fdab07fdd3c10bb21edab3cbfe8cf5696f453afce75d815d9d7223fbe88b"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:aa41e526a5d4a9dfcfbab0716c7e8a1b215abd3f3df5a45cf18a12721d31cb5d"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ffc519621dce0c767e96b9c53f09c5d215578e10b02c285809f76509a3931482"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-win32.whl", hash = "sha256:f19c1585933c82098c2a520f8ec1227f20e339e33aca8fa6f956f6691b784e67"}, + {file = "charset_normalizer-3.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:707b82d19e65c9bd28b81dde95249b07bf9f5b90ebe1ef17d9b57473f8a64b7b"}, 
+ {file = "charset_normalizer-3.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:dbe03226baf438ac4fda9e2d0715022fd579cb641c4cf639fa40d53b2fe6f3e2"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:dd9a8bd8900e65504a305bf8ae6fa9fbc66de94178c420791d0293702fce2df7"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b8831399554b92b72af5932cdbbd4ddc55c55f631bb13ff8fe4e6536a06c5c51"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a14969b8691f7998e74663b77b4c36c0337cb1df552da83d5c9004a93afdb574"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dcaf7c1524c0542ee2fc82cc8ec337f7a9f7edee2532421ab200d2b920fc97cf"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:425c5f215d0eecee9a56cdb703203dda90423247421bf0d67125add85d0c4455"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:d5b054862739d276e09928de37c79ddeec42a6e1bfc55863be96a36ba22926f6"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:f3e73a4255342d4eb26ef6df01e3962e73aa29baa3124a8e824c5d3364a65748"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:2f6c34da58ea9c1a9515621f4d9ac379871a8f21168ba1b5e09d74250de5ad62"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:f09cb5a7bbe1ecae6e87901a2eb23e0256bb524a79ccc53eb0b7629fbe7677c4"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:0099d79bdfcf5c1f0c2c72f91516702ebf8b0b8ddd8905f97a8aecf49712c621"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-win32.whl", hash = "sha256:9c98230f5042f4945f957d006edccc2af1e03ed5e37ce7c373f00a5a4daa6149"}, + {file = "charset_normalizer-3.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:62f60aebecfc7f4b82e3f639a7d1433a20ec32824db2199a11ad4f5e146ef5ee"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:af73657b7a68211996527dbfeffbb0864e043d270580c5aef06dc4b659a4b578"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cab5d0b79d987c67f3b9e9c53f54a61360422a5a0bc075f43cab5621d530c3b6"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9289fd5dddcf57bab41d044f1756550f9e7cf0c8e373b8cdf0ce8773dc4bd417"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6b493a043635eb376e50eedf7818f2f322eabbaa974e948bd8bdd29eb7ef2a51"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9fa2566ca27d67c86569e8c85297aaf413ffab85a8960500f12ea34ff98e4c41"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a8e538f46104c815be19c975572d74afb53f29650ea2025bbfaef359d2de2f7f"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6fd30dc99682dc2c603c2b315bded2799019cea829f8bf57dc6b61efde6611c8"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2006769bd1640bdf4d5641c69a3d63b71b81445473cac5ded39740a226fa88ab"}, + {file = 
"charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:dc15e99b2d8a656f8e666854404f1ba54765871104e50c8e9813af8a7db07f12"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:ab2e5bef076f5a235c3774b4f4028a680432cded7cad37bba0fd90d64b187d19"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:4ec9dd88a5b71abfc74e9df5ebe7921c35cbb3b641181a531ca65cdb5e8e4dea"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:43193c5cda5d612f247172016c4bb71251c784d7a4d9314677186a838ad34858"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:aa693779a8b50cd97570e5a0f343538a8dbd3e496fa5dcb87e29406ad0299654"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-win32.whl", hash = "sha256:7706f5850360ac01d80c89bcef1640683cc12ed87f42579dab6c5d3ed6888613"}, + {file = "charset_normalizer-3.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:c3e446d253bd88f6377260d07c895816ebf33ffffd56c1c792b13bff9c3e1ade"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:980b4f289d1d90ca5efcf07958d3eb38ed9c0b7676bf2831a54d4f66f9c27dfa"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:f28f891ccd15c514a0981f3b9db9aa23d62fe1a99997512b0491d2ed323d229a"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8aacce6e2e1edcb6ac625fb0f8c3a9570ccc7bfba1f63419b3769ccf6a00ed0"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bd7af3717683bea4c87acd8c0d3d5b44d56120b26fd3f8a692bdd2d5260c620a"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ff2ed8194587faf56555927b3aa10e6fb69d931e33953943bc4f837dfee2242"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e91f541a85298cf35433bf66f3fab2a4a2cff05c127eeca4af174f6d497f0d4b"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:309a7de0a0ff3040acaebb35ec45d18db4b28232f21998851cfa709eeff49d62"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:285e96d9d53422efc0d7a17c60e59f37fbf3dfa942073f666db4ac71e8d726d0"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:5d447056e2ca60382d460a604b6302d8db69476fd2015c81e7c35417cfabe4cd"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:20587d20f557fe189b7947d8e7ec5afa110ccf72a3128d61a2a387c3313f46be"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:130272c698667a982a5d0e626851ceff662565379baf0ff2cc58067b81d4f11d"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:ab22fbd9765e6954bc0bcff24c25ff71dcbfdb185fcdaca49e81bac68fe724d3"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7782afc9b6b42200f7362858f9e73b1f8316afb276d316336c0ec3bd73312742"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-win32.whl", hash = "sha256:2de62e8801ddfff069cd5c504ce3bc9672b23266597d4e4f50eda28846c322f2"}, + {file = "charset_normalizer-3.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:95c3c157765b031331dd4db3c775e58deaee050a3042fcad72cbc4189d7c8dca"}, + {file = 
"charset_normalizer-3.4.0-py3-none-any.whl", hash = "sha256:fe9f97feb71aa9896b81973a7bbada8c49501dc73e58a10fcef6663af95e5079"}, + {file = "charset_normalizer-3.4.0.tar.gz", hash = "sha256:223217c3d4f82c3ac5e29032b3f1c2eb0fb591b72161f86d93f5719079dae93e"}, +] + +[[package]] +name = "click" +version = "8.1.7" +description = "Composable command line interface toolkit" +optional = false +python-versions = ">=3.7" +files = [ + {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, + {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[[package]] +name = "colorama" +version = "0.4.6" +description = "Cross-platform colored terminal text." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +files = [ + {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, + {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, +] + +[[package]] +name = "comm" +version = "0.2.2" +description = "Jupyter Python Comm implementation, for usage in ipykernel, xeus-python etc." +optional = false +python-versions = ">=3.8" +files = [ + {file = "comm-0.2.2-py3-none-any.whl", hash = "sha256:e6fb86cb70ff661ee8c9c14e7d36d6de3b4066f1441be4063df9c5009f0a64d3"}, + {file = "comm-0.2.2.tar.gz", hash = "sha256:3fd7a84065306e07bea1773df6eb8282de51ba82f77c72f9c85716ab11fe980e"}, +] + +[package.dependencies] +traitlets = ">=4" + +[package.extras] +test = ["pytest"] + +[[package]] +name = "csscompressor" +version = "0.9.5" +description = "A python port of YUI CSS Compressor" +optional = false +python-versions = "*" +files = [ + {file = "csscompressor-0.9.5.tar.gz", hash = "sha256:afa22badbcf3120a4f392e4d22f9fff485c044a1feda4a950ecc5eba9dd31a05"}, +] + +[[package]] +name = "cssselect2" +version = "0.7.0" +description = "CSS selectors for Python ElementTree" +optional = false +python-versions = ">=3.7" +files = [ + {file = "cssselect2-0.7.0-py3-none-any.whl", hash = "sha256:fd23a65bfd444595913f02fc71f6b286c29261e354c41d722ca7a261a49b5969"}, + {file = "cssselect2-0.7.0.tar.gz", hash = "sha256:1ccd984dab89fc68955043aca4e1b03e0cf29cad9880f6e28e3ba7a74b14aa5a"}, +] + +[package.dependencies] +tinycss2 = "*" +webencodings = "*" + +[package.extras] +doc = ["sphinx", "sphinx_rtd_theme"] +test = ["flake8", "isort", "pytest"] + +[[package]] +name = "debugpy" +version = "1.8.7" +description = "An implementation of the Debug Adapter Protocol for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "debugpy-1.8.7-cp310-cp310-macosx_14_0_x86_64.whl", hash = "sha256:95fe04a573b8b22896c404365e03f4eda0ce0ba135b7667a1e57bd079793b96b"}, + {file = "debugpy-1.8.7-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:628a11f4b295ffb4141d8242a9bb52b77ad4a63a2ad19217a93be0f77f2c28c9"}, + {file = "debugpy-1.8.7-cp310-cp310-win32.whl", hash = "sha256:85ce9c1d0eebf622f86cc68618ad64bf66c4fc3197d88f74bb695a416837dd55"}, + {file = "debugpy-1.8.7-cp310-cp310-win_amd64.whl", hash = "sha256:29e1571c276d643757ea126d014abda081eb5ea4c851628b33de0c2b6245b037"}, + {file = "debugpy-1.8.7-cp311-cp311-macosx_14_0_universal2.whl", hash = 
"sha256:caf528ff9e7308b74a1749c183d6808ffbedbb9fb6af78b033c28974d9b8831f"}, + {file = "debugpy-1.8.7-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cba1d078cf2e1e0b8402e6bda528bf8fda7ccd158c3dba6c012b7897747c41a0"}, + {file = "debugpy-1.8.7-cp311-cp311-win32.whl", hash = "sha256:171899588bcd412151e593bd40d9907133a7622cd6ecdbdb75f89d1551df13c2"}, + {file = "debugpy-1.8.7-cp311-cp311-win_amd64.whl", hash = "sha256:6e1c4ffb0c79f66e89dfd97944f335880f0d50ad29525dc792785384923e2211"}, + {file = "debugpy-1.8.7-cp312-cp312-macosx_14_0_universal2.whl", hash = "sha256:4d27d842311353ede0ad572600c62e4bcd74f458ee01ab0dd3a1a4457e7e3706"}, + {file = "debugpy-1.8.7-cp312-cp312-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:703c1fd62ae0356e194f3e7b7a92acd931f71fe81c4b3be2c17a7b8a4b546ec2"}, + {file = "debugpy-1.8.7-cp312-cp312-win32.whl", hash = "sha256:2f729228430ef191c1e4df72a75ac94e9bf77413ce5f3f900018712c9da0aaca"}, + {file = "debugpy-1.8.7-cp312-cp312-win_amd64.whl", hash = "sha256:45c30aaefb3e1975e8a0258f5bbd26cd40cde9bfe71e9e5a7ac82e79bad64e39"}, + {file = "debugpy-1.8.7-cp313-cp313-macosx_14_0_universal2.whl", hash = "sha256:d050a1ec7e925f514f0f6594a1e522580317da31fbda1af71d1530d6ea1f2b40"}, + {file = "debugpy-1.8.7-cp313-cp313-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f2f4349a28e3228a42958f8ddaa6333d6f8282d5edaea456070e48609c5983b7"}, + {file = "debugpy-1.8.7-cp313-cp313-win32.whl", hash = "sha256:11ad72eb9ddb436afb8337891a986302e14944f0f755fd94e90d0d71e9100bba"}, + {file = "debugpy-1.8.7-cp313-cp313-win_amd64.whl", hash = "sha256:2efb84d6789352d7950b03d7f866e6d180284bc02c7e12cb37b489b7083d81aa"}, + {file = "debugpy-1.8.7-cp38-cp38-macosx_14_0_x86_64.whl", hash = "sha256:4b908291a1d051ef3331484de8e959ef3e66f12b5e610c203b5b75d2725613a7"}, + {file = "debugpy-1.8.7-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da8df5b89a41f1fd31503b179d0a84a5fdb752dddd5b5388dbd1ae23cda31ce9"}, + {file = "debugpy-1.8.7-cp38-cp38-win32.whl", hash = "sha256:b12515e04720e9e5c2216cc7086d0edadf25d7ab7e3564ec8b4521cf111b4f8c"}, + {file = "debugpy-1.8.7-cp38-cp38-win_amd64.whl", hash = "sha256:93176e7672551cb5281577cdb62c63aadc87ec036f0c6a486f0ded337c504596"}, + {file = "debugpy-1.8.7-cp39-cp39-macosx_14_0_x86_64.whl", hash = "sha256:90d93e4f2db442f8222dec5ec55ccfc8005821028982f1968ebf551d32b28907"}, + {file = "debugpy-1.8.7-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b6db2a370e2700557a976eaadb16243ec9c91bd46f1b3bb15376d7aaa7632c81"}, + {file = "debugpy-1.8.7-cp39-cp39-win32.whl", hash = "sha256:a6cf2510740e0c0b4a40330640e4b454f928c7b99b0c9dbf48b11efba08a8cda"}, + {file = "debugpy-1.8.7-cp39-cp39-win_amd64.whl", hash = "sha256:6a9d9d6d31846d8e34f52987ee0f1a904c7baa4912bf4843ab39dadf9b8f3e0d"}, + {file = "debugpy-1.8.7-py2.py3-none-any.whl", hash = "sha256:57b00de1c8d2c84a61b90880f7e5b6deaf4c312ecbde3a0e8912f2a56c4ac9ae"}, + {file = "debugpy-1.8.7.zip", hash = "sha256:18b8f731ed3e2e1df8e9cdaa23fb1fc9c24e570cd0081625308ec51c82efe42e"}, +] + +[[package]] +name = "decorator" +version = "5.1.1" +description = "Decorators for Humans" +optional = false +python-versions = ">=3.5" +files = [ + {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, 
+ {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"}, +] + +[[package]] +name = "defusedxml" +version = "0.7.1" +description = "XML bomb protection for Python stdlib modules" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +files = [ + {file = "defusedxml-0.7.1-py2.py3-none-any.whl", hash = "sha256:a352e7e428770286cc899e2542b6cdaedb2b4953ff269a210103ec58f6198a61"}, + {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"}, +] + +[[package]] +name = "editorconfig" +version = "0.12.4" +description = "EditorConfig File Locator and Interpreter for Python" +optional = false +python-versions = "*" +files = [ + {file = "EditorConfig-0.12.4.tar.gz", hash = "sha256:24857fa1793917dd9ccf0c7810a07e05404ce9b823521c7dce22a4fb5d125f80"}, +] + +[[package]] +name = "exceptiongroup" +version = "1.2.2" +description = "Backport of PEP 654 (exception groups)" +optional = false +python-versions = ">=3.7" +files = [ + {file = "exceptiongroup-1.2.2-py3-none-any.whl", hash = "sha256:3111b9d131c238bec2f8f516e123e14ba243563fb135d3fe885990585aa7795b"}, + {file = "exceptiongroup-1.2.2.tar.gz", hash = "sha256:47c2edf7c6738fafb49fd34290706d1a1a2f4d1c6df275526b62cbb4aa5393cc"}, +] + +[package.extras] +test = ["pytest (>=6)"] + +[[package]] +name = "executing" +version = "2.1.0" +description = "Get the currently executing AST node of a frame, and other information" +optional = false +python-versions = ">=3.8" +files = [ + {file = "executing-2.1.0-py2.py3-none-any.whl", hash = "sha256:8d63781349375b5ebccc3142f4b30350c0cd9c79f921cde38be2be4637e98eaf"}, + {file = "executing-2.1.0.tar.gz", hash = "sha256:8ea27ddd260da8150fa5a708269c4a10e76161e2496ec3e587da9e3c0fe4b9ab"}, +] + +[package.extras] +tests = ["asttokens (>=2.1.0)", "coverage", "coverage-enable-subprocess", "ipython", "littleutils", "pytest", "rich"] + +[[package]] +name = "fastjsonschema" +version = "2.20.0" +description = "Fastest Python implementation of JSON schema" +optional = false +python-versions = "*" +files = [ + {file = "fastjsonschema-2.20.0-py3-none-any.whl", hash = "sha256:5875f0b0fa7a0043a91e93a9b8f793bcbbba9691e7fd83dca95c28ba26d21f0a"}, + {file = "fastjsonschema-2.20.0.tar.gz", hash = "sha256:3d48fc5300ee96f5d116f10fe6f28d938e6008f59a6a025c2649475b87f76a23"}, +] + +[package.extras] +devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"] + +[[package]] +name = "ghp-import" +version = "2.1.0" +description = "Copy your docs directly to the gh-pages branch." 
+optional = false +python-versions = "*" +files = [ + {file = "ghp-import-2.1.0.tar.gz", hash = "sha256:9c535c4c61193c2df8871222567d7fd7e5014d835f97dc7b7439069e2413d343"}, + {file = "ghp_import-2.1.0-py3-none-any.whl", hash = "sha256:8337dd7b50877f163d4c0289bc1f1c7f127550241988d568c1db512c4324a619"}, +] + +[package.dependencies] +python-dateutil = ">=2.8.1" + +[package.extras] +dev = ["flake8", "markdown", "twine", "wheel"] + +[[package]] +name = "gitdb" +version = "4.0.11" +description = "Git Object Database" +optional = false +python-versions = ">=3.7" +files = [ + {file = "gitdb-4.0.11-py3-none-any.whl", hash = "sha256:81a3407ddd2ee8df444cbacea00e2d038e40150acfa3001696fe0dcf1d3adfa4"}, + {file = "gitdb-4.0.11.tar.gz", hash = "sha256:bf5421126136d6d0af55bc1e7c1af1c397a34f5b7bd79e776cd3e89785c2b04b"}, +] + +[package.dependencies] +smmap = ">=3.0.1,<6" + +[[package]] +name = "gitpython" +version = "3.1.43" +description = "GitPython is a Python library used to interact with Git repositories" +optional = false +python-versions = ">=3.7" +files = [ + {file = "GitPython-3.1.43-py3-none-any.whl", hash = "sha256:eec7ec56b92aad751f9912a73404bc02ba212a23adb2c7098ee668417051a1ff"}, + {file = "GitPython-3.1.43.tar.gz", hash = "sha256:35f314a9f878467f5453cc1fee295c3e18e52f1b99f10f6cf5b1682e968a9e7c"}, +] + +[package.dependencies] +gitdb = ">=4.0.1,<5" + +[package.extras] +doc = ["sphinx (==4.3.2)", "sphinx-autodoc-typehints", "sphinx-rtd-theme", "sphinxcontrib-applehelp (>=1.0.2,<=1.0.4)", "sphinxcontrib-devhelp (==1.0.2)", "sphinxcontrib-htmlhelp (>=2.0.0,<=2.0.1)", "sphinxcontrib-qthelp (==1.0.3)", "sphinxcontrib-serializinghtml (==1.1.5)"] +test = ["coverage[toml]", "ddt (>=1.1.1,!=1.4.3)", "mock", "mypy", "pre-commit", "pytest (>=7.3.1)", "pytest-cov", "pytest-instafail", "pytest-mock", "pytest-sugar", "typing-extensions"] + +[[package]] +name = "griffe" +version = "1.4.1" +description = "Signatures for entire Python programs. Extract the structure, the frame, the skeleton of your project, to generate API documentation or find breaking changes in your API." +optional = false +python-versions = ">=3.9" +files = [ + {file = "griffe-1.4.1-py3-none-any.whl", hash = "sha256:84295ee0b27743bd880aea75632830ef02ded65d16124025e4c263bb826ab645"}, + {file = "griffe-1.4.1.tar.gz", hash = "sha256:911a201b01dc92e08c0e84c38a301e9da5ec067f00e7d9f2e39bc24dbfa3c176"}, +] + +[package.dependencies] +colorama = ">=0.4" + +[[package]] +name = "hjson" +version = "3.1.0" +description = "Hjson, a user interface for JSON." 
+optional = false +python-versions = "*" +files = [ + {file = "hjson-3.1.0-py3-none-any.whl", hash = "sha256:65713cdcf13214fb554eb8b4ef803419733f4f5e551047c9b711098ab7186b89"}, + {file = "hjson-3.1.0.tar.gz", hash = "sha256:55af475a27cf83a7969c808399d7bccdec8fb836a07ddbd574587593b9cdcf75"}, +] + +[[package]] +name = "htmlmin2" +version = "0.1.13" +description = "An HTML Minifier" +optional = false +python-versions = "*" +files = [ + {file = "htmlmin2-0.1.13-py3-none-any.whl", hash = "sha256:75609f2a42e64f7ce57dbff28a39890363bde9e7e5885db633317efbdf8c79a2"}, +] + +[[package]] +name = "idna" +version = "3.10" +description = "Internationalized Domain Names in Applications (IDNA)" +optional = false +python-versions = ">=3.6" +files = [ + {file = "idna-3.10-py3-none-any.whl", hash = "sha256:946d195a0d259cbba61165e88e65941f16e9b36ea6ddb97f00452bae8b1287d3"}, + {file = "idna-3.10.tar.gz", hash = "sha256:12f65c9b470abda6dc35cf8e63cc574b1c52b11df2c86030af0ac09b01b13ea9"}, +] + +[package.extras] +all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] + +[[package]] +name = "importlib-metadata" +version = "8.5.0" +description = "Read metadata from Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b"}, + {file = "importlib_metadata-8.5.0.tar.gz", hash = "sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7"}, +] + +[package.dependencies] +zipp = ">=3.20" + +[package.extras] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +enabler = ["pytest-enabler (>=2.2)"] +perf = ["ipython"] +test = ["flufl.flake8", "importlib-resources (>=1.3)", "jaraco.test (>=5.4)", "packaging", "pyfakefs", "pytest (>=6,!=8.1.*)", "pytest-perf (>=0.9.2)"] +type = ["pytest-mypy"] + +[[package]] +name = "importlib-resources" +version = "6.4.5" +description = "Read resources from Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "importlib_resources-6.4.5-py3-none-any.whl", hash = "sha256:ac29d5f956f01d5e4bb63102a5a19957f1b9175e45649977264a1416783bb717"}, + {file = "importlib_resources-6.4.5.tar.gz", hash = "sha256:980862a1d16c9e147a59603677fa2aa5fd82b87f223b6cb870695bcfce830065"}, +] + +[package.extras] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +enabler = ["pytest-enabler (>=2.2)"] +test = ["jaraco.test (>=5.4)", "pytest (>=6,!=8.1.*)", "zipp (>=3.17)"] +type = ["pytest-mypy"] + +[[package]] +name = "ipykernel" +version = "6.29.5" +description = "IPython Kernel for Jupyter" +optional = false +python-versions = ">=3.8" +files = [ + {file = "ipykernel-6.29.5-py3-none-any.whl", hash = "sha256:afdb66ba5aa354b09b91379bac28ae4afebbb30e8b39510c9690afb7a10421b5"}, + {file = "ipykernel-6.29.5.tar.gz", hash = "sha256:f093a22c4a40f8828f8e330a9c297cb93dcab13bd9678ded6de8e5cf81c56215"}, +] + +[package.dependencies] +appnope = {version = "*", markers = "platform_system == \"Darwin\""} +comm = ">=0.1.1" +debugpy = ">=1.6.5" +ipython = ">=7.23.1" +jupyter-client = ">=6.1.12" +jupyter-core = ">=4.12,<5.0.dev0 || >=5.1.dev0" +matplotlib-inline = ">=0.1" +nest-asyncio = "*" +packaging = 
"*" +psutil = "*" +pyzmq = ">=24" +tornado = ">=6.1" +traitlets = ">=5.4.0" + +[package.extras] +cov = ["coverage[toml]", "curio", "matplotlib", "pytest-cov", "trio"] +docs = ["myst-parser", "pydata-sphinx-theme", "sphinx", "sphinx-autodoc-typehints", "sphinxcontrib-github-alt", "sphinxcontrib-spelling", "trio"] +pyqt5 = ["pyqt5"] +pyside6 = ["pyside6"] +test = ["flaky", "ipyparallel", "pre-commit", "pytest (>=7.0)", "pytest-asyncio (>=0.23.5)", "pytest-cov", "pytest-timeout"] + +[[package]] +name = "ipython" +version = "8.28.0" +description = "IPython: Productive Interactive Computing" +optional = false +python-versions = ">=3.10" +files = [ + {file = "ipython-8.28.0-py3-none-any.whl", hash = "sha256:530ef1e7bb693724d3cdc37287c80b07ad9b25986c007a53aa1857272dac3f35"}, + {file = "ipython-8.28.0.tar.gz", hash = "sha256:0d0d15ca1e01faeb868ef56bc7ee5a0de5bd66885735682e8a322ae289a13d1a"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +decorator = "*" +exceptiongroup = {version = "*", markers = "python_version < \"3.11\""} +jedi = ">=0.16" +matplotlib-inline = "*" +pexpect = {version = ">4.3", markers = "sys_platform != \"win32\" and sys_platform != \"emscripten\""} +prompt-toolkit = ">=3.0.41,<3.1.0" +pygments = ">=2.4.0" +stack-data = "*" +traitlets = ">=5.13.0" +typing-extensions = {version = ">=4.6", markers = "python_version < \"3.12\""} + +[package.extras] +all = ["ipython[black,doc,kernel,matplotlib,nbconvert,nbformat,notebook,parallel,qtconsole]", "ipython[test,test-extra]"] +black = ["black"] +doc = ["docrepr", "exceptiongroup", "intersphinx-registry", "ipykernel", "ipython[test]", "matplotlib", "setuptools (>=18.5)", "sphinx (>=1.3)", "sphinx-rtd-theme", "sphinxcontrib-jquery", "tomli", "typing-extensions"] +kernel = ["ipykernel"] +matplotlib = ["matplotlib"] +nbconvert = ["nbconvert"] +nbformat = ["nbformat"] +notebook = ["ipywidgets", "notebook"] +parallel = ["ipyparallel"] +qtconsole = ["qtconsole"] +test = ["packaging", "pickleshare", "pytest", "pytest-asyncio (<0.22)", "testpath"] +test-extra = ["curio", "ipython[test]", "matplotlib (!=3.2.0)", "nbformat", "numpy (>=1.23)", "pandas", "trio"] + +[[package]] +name = "jedi" +version = "0.19.1" +description = "An autocompletion tool for Python that can be used for text editors." 
+optional = false +python-versions = ">=3.6" +files = [ + {file = "jedi-0.19.1-py2.py3-none-any.whl", hash = "sha256:e983c654fe5c02867aef4cdfce5a2fbb4a50adc0af145f70504238f18ef5e7e0"}, + {file = "jedi-0.19.1.tar.gz", hash = "sha256:cf0496f3651bc65d7174ac1b7d043eff454892c708a87d1b683e57b569927ffd"}, +] + +[package.dependencies] +parso = ">=0.8.3,<0.9.0" + +[package.extras] +docs = ["Jinja2 (==2.11.3)", "MarkupSafe (==1.1.1)", "Pygments (==2.8.1)", "alabaster (==0.7.12)", "babel (==2.9.1)", "chardet (==4.0.0)", "commonmark (==0.8.1)", "docutils (==0.17.1)", "future (==0.18.2)", "idna (==2.10)", "imagesize (==1.2.0)", "mock (==1.0.1)", "packaging (==20.9)", "pyparsing (==2.4.7)", "pytz (==2021.1)", "readthedocs-sphinx-ext (==2.1.4)", "recommonmark (==0.5.0)", "requests (==2.25.1)", "six (==1.15.0)", "snowballstemmer (==2.1.0)", "sphinx (==1.8.5)", "sphinx-rtd-theme (==0.4.3)", "sphinxcontrib-serializinghtml (==1.1.4)", "sphinxcontrib-websupport (==1.2.4)", "urllib3 (==1.26.4)"] +qa = ["flake8 (==5.0.4)", "mypy (==0.971)", "types-setuptools (==67.2.0.1)"] +testing = ["Django", "attrs", "colorama", "docopt", "pytest (<7.0.0)"] + +[[package]] +name = "jinja2" +version = "3.1.4" +description = "A very fast and expressive template engine." +optional = false +python-versions = ">=3.7" +files = [ + {file = "jinja2-3.1.4-py3-none-any.whl", hash = "sha256:bc5dd2abb727a5319567b7a813e6a2e7318c39f4f487cfe6c89c6f9c7d25197d"}, + {file = "jinja2-3.1.4.tar.gz", hash = "sha256:4a3aee7acbbe7303aede8e9648d13b8bf88a429282aa6122a993f0ac800cb369"}, +] + +[package.dependencies] +MarkupSafe = ">=2.0" + +[package.extras] +i18n = ["Babel (>=2.7)"] + +[[package]] +name = "jsbeautifier" +version = "1.15.1" +description = "JavaScript unobfuscator and beautifier." +optional = false +python-versions = "*" +files = [ + {file = "jsbeautifier-1.15.1.tar.gz", hash = "sha256:ebd733b560704c602d744eafc839db60a1ee9326e30a2a80c4adb8718adc1b24"}, +] + +[package.dependencies] +editorconfig = ">=0.12.2" +six = ">=1.13.0" + +[[package]] +name = "jsmin" +version = "3.0.1" +description = "JavaScript minifier." 
+optional = false +python-versions = "*" +files = [ + {file = "jsmin-3.0.1.tar.gz", hash = "sha256:c0959a121ef94542e807a674142606f7e90214a2b3d1eb17300244bbb5cc2bfc"}, +] + +[[package]] +name = "jsonschema" +version = "4.23.0" +description = "An implementation of JSON Schema validation for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "jsonschema-4.23.0-py3-none-any.whl", hash = "sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566"}, + {file = "jsonschema-4.23.0.tar.gz", hash = "sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4"}, +] + +[package.dependencies] +attrs = ">=22.2.0" +jsonschema-specifications = ">=2023.03.6" +referencing = ">=0.28.4" +rpds-py = ">=0.7.1" + +[package.extras] +format = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3987", "uri-template", "webcolors (>=1.11)"] +format-nongpl = ["fqdn", "idna", "isoduration", "jsonpointer (>1.13)", "rfc3339-validator", "rfc3986-validator (>0.1.0)", "uri-template", "webcolors (>=24.6.0)"] + +[[package]] +name = "jsonschema-specifications" +version = "2024.10.1" +description = "The JSON Schema meta-schemas and vocabularies, exposed as a Registry" +optional = false +python-versions = ">=3.9" +files = [ + {file = "jsonschema_specifications-2024.10.1-py3-none-any.whl", hash = "sha256:a09a0680616357d9a0ecf05c12ad234479f549239d0f5b55f3deea67475da9bf"}, + {file = "jsonschema_specifications-2024.10.1.tar.gz", hash = "sha256:0f38b83639958ce1152d02a7f062902c41c8fd20d558b0c34344292d417ae272"}, +] + +[package.dependencies] +referencing = ">=0.31.0" + +[[package]] +name = "jupyter-client" +version = "8.6.3" +description = "Jupyter protocol implementation and client libraries" +optional = false +python-versions = ">=3.8" +files = [ + {file = "jupyter_client-8.6.3-py3-none-any.whl", hash = "sha256:e8a19cc986cc45905ac3362915f410f3af85424b4c0905e94fa5f2cb08e8f23f"}, + {file = "jupyter_client-8.6.3.tar.gz", hash = "sha256:35b3a0947c4a6e9d589eb97d7d4cd5e90f910ee73101611f01283732bd6d9419"}, +] + +[package.dependencies] +jupyter-core = ">=4.12,<5.0.dev0 || >=5.1.dev0" +python-dateutil = ">=2.8.2" +pyzmq = ">=23.0" +tornado = ">=6.2" +traitlets = ">=5.3" + +[package.extras] +docs = ["ipykernel", "myst-parser", "pydata-sphinx-theme", "sphinx (>=4)", "sphinx-autodoc-typehints", "sphinxcontrib-github-alt", "sphinxcontrib-spelling"] +test = ["coverage", "ipykernel (>=6.14)", "mypy", "paramiko", "pre-commit", "pytest (<8.2.0)", "pytest-cov", "pytest-jupyter[client] (>=0.4.1)", "pytest-timeout"] + +[[package]] +name = "jupyter-core" +version = "5.7.2" +description = "Jupyter core package. A base package on which Jupyter projects rely." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "jupyter_core-5.7.2-py3-none-any.whl", hash = "sha256:4f7315d2f6b4bcf2e3e7cb6e46772eba760ae459cd1f59d29eb57b0a01bd7409"}, + {file = "jupyter_core-5.7.2.tar.gz", hash = "sha256:aa5f8d32bbf6b431ac830496da7392035d6f61b4f54872f15c4bd2a9c3f536d9"}, +] + +[package.dependencies] +platformdirs = ">=2.5" +pywin32 = {version = ">=300", markers = "sys_platform == \"win32\" and platform_python_implementation != \"PyPy\""} +traitlets = ">=5.3" + +[package.extras] +docs = ["myst-parser", "pydata-sphinx-theme", "sphinx-autodoc-typehints", "sphinxcontrib-github-alt", "sphinxcontrib-spelling", "traitlets"] +test = ["ipykernel", "pre-commit", "pytest (<8)", "pytest-cov", "pytest-timeout"] + +[[package]] +name = "jupyterlab-pygments" +version = "0.3.0" +description = "Pygments theme using JupyterLab CSS variables" +optional = false +python-versions = ">=3.8" +files = [ + {file = "jupyterlab_pygments-0.3.0-py3-none-any.whl", hash = "sha256:841a89020971da1d8693f1a99997aefc5dc424bb1b251fd6322462a1b8842780"}, + {file = "jupyterlab_pygments-0.3.0.tar.gz", hash = "sha256:721aca4d9029252b11cfa9d185e5b5af4d54772bb8072f9b7036f4170054d35d"}, +] + +[[package]] +name = "jupytext" +version = "1.16.4" +description = "Jupyter notebooks as Markdown documents, Julia, Python or R scripts" +optional = false +python-versions = ">=3.8" +files = [ + {file = "jupytext-1.16.4-py3-none-any.whl", hash = "sha256:76989d2690e65667ea6fb411d8056abe7cd0437c07bd774660b83d62acf9490a"}, + {file = "jupytext-1.16.4.tar.gz", hash = "sha256:28e33f46f2ce7a41fb9d677a4a2c95327285579b64ca104437c4b9eb1e4174e9"}, +] + +[package.dependencies] +markdown-it-py = ">=1.0" +mdit-py-plugins = "*" +nbformat = "*" +packaging = "*" +pyyaml = "*" +tomli = {version = "*", markers = "python_version < \"3.11\""} + +[package.extras] +dev = ["autopep8", "black", "flake8", "gitpython", "ipykernel", "isort", "jupyter-fs (>=1.0)", "jupyter-server (!=2.11)", "nbconvert", "pre-commit", "pytest", "pytest-cov (>=2.6.1)", "pytest-randomly", "pytest-xdist", "sphinx-gallery (<0.8)"] +docs = ["myst-parser", "sphinx", "sphinx-copybutton", "sphinx-rtd-theme"] +test = ["pytest", "pytest-randomly", "pytest-xdist"] +test-cov = ["ipykernel", "jupyter-server (!=2.11)", "nbconvert", "pytest", "pytest-cov (>=2.6.1)", "pytest-randomly", "pytest-xdist"] +test-external = ["autopep8", "black", "flake8", "gitpython", "ipykernel", "isort", "jupyter-fs (>=1.0)", "jupyter-server (!=2.11)", "nbconvert", "pre-commit", "pytest", "pytest-randomly", "pytest-xdist", "sphinx-gallery (<0.8)"] +test-functional = ["pytest", "pytest-randomly", "pytest-xdist"] +test-integration = ["ipykernel", "jupyter-server (!=2.11)", "nbconvert", "pytest", "pytest-randomly", "pytest-xdist"] +test-ui = ["calysto-bash"] + +[[package]] +name = "markdown" +version = "3.7" +description = "Python implementation of John Gruber's Markdown." +optional = false +python-versions = ">=3.8" +files = [ + {file = "Markdown-3.7-py3-none-any.whl", hash = "sha256:7eb6df5690b81a1d7942992c97fad2938e956e79df20cbc6186e9c3a77b1c803"}, + {file = "markdown-3.7.tar.gz", hash = "sha256:2ae2471477cfd02dbbf038d5d9bc226d40def84b4fe2986e49b59b6b472bbed2"}, +] + +[package.extras] +docs = ["mdx-gh-links (>=0.2)", "mkdocs (>=1.5)", "mkdocs-gen-files", "mkdocs-literate-nav", "mkdocs-nature (>=0.6)", "mkdocs-section-index", "mkdocstrings[python]"] +testing = ["coverage", "pyyaml"] + +[[package]] +name = "markdown-it-py" +version = "3.0.0" +description = "Python port of markdown-it. 
Markdown parsing, done right!" +optional = false +python-versions = ">=3.8" +files = [ + {file = "markdown-it-py-3.0.0.tar.gz", hash = "sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb"}, + {file = "markdown_it_py-3.0.0-py3-none-any.whl", hash = "sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1"}, +] + +[package.dependencies] +mdurl = ">=0.1,<1.0" + +[package.extras] +benchmarking = ["psutil", "pytest", "pytest-benchmark"] +code-style = ["pre-commit (>=3.0,<4.0)"] +compare = ["commonmark (>=0.9,<1.0)", "markdown (>=3.4,<4.0)", "mistletoe (>=1.0,<2.0)", "mistune (>=2.0,<3.0)", "panflute (>=2.3,<3.0)"] +linkify = ["linkify-it-py (>=1,<3)"] +plugins = ["mdit-py-plugins"] +profiling = ["gprof2dot"] +rtd = ["jupyter_sphinx", "mdit-py-plugins", "myst-parser", "pyyaml", "sphinx", "sphinx-copybutton", "sphinx-design", "sphinx_book_theme"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] + +[[package]] +name = "markupsafe" +version = "3.0.1" +description = "Safely add untrusted strings to HTML/XML markup." +optional = false +python-versions = ">=3.9" +files = [ + {file = "MarkupSafe-3.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:db842712984e91707437461930e6011e60b39136c7331e971952bb30465bc1a1"}, + {file = "MarkupSafe-3.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:3ffb4a8e7d46ed96ae48805746755fadd0909fea2306f93d5d8233ba23dda12a"}, + {file = "MarkupSafe-3.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:67c519635a4f64e495c50e3107d9b4075aec33634272b5db1cde839e07367589"}, + {file = "MarkupSafe-3.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:48488d999ed50ba8d38c581d67e496f955821dc183883550a6fbc7f1aefdc170"}, + {file = "MarkupSafe-3.0.1-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f31ae06f1328595d762c9a2bf29dafd8621c7d3adc130cbb46278079758779ca"}, + {file = "MarkupSafe-3.0.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:80fcbf3add8790caddfab6764bde258b5d09aefbe9169c183f88a7410f0f6dea"}, + {file = "MarkupSafe-3.0.1-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:3341c043c37d78cc5ae6e3e305e988532b072329639007fd408a476642a89fd6"}, + {file = "MarkupSafe-3.0.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:cb53e2a99df28eee3b5f4fea166020d3ef9116fdc5764bc5117486e6d1211b25"}, + {file = "MarkupSafe-3.0.1-cp310-cp310-win32.whl", hash = "sha256:db15ce28e1e127a0013dfb8ac243a8e392db8c61eae113337536edb28bdc1f97"}, + {file = "MarkupSafe-3.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:4ffaaac913c3f7345579db4f33b0020db693f302ca5137f106060316761beea9"}, + {file = "MarkupSafe-3.0.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:26627785a54a947f6d7336ce5963569b5d75614619e75193bdb4e06e21d447ad"}, + {file = "MarkupSafe-3.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b954093679d5750495725ea6f88409946d69cfb25ea7b4c846eef5044194f583"}, + {file = "MarkupSafe-3.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:973a371a55ce9ed333a3a0f8e0bcfae9e0d637711534bcb11e130af2ab9334e7"}, + {file = "MarkupSafe-3.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:244dbe463d5fb6d7ce161301a03a6fe744dac9072328ba9fc82289238582697b"}, + {file = "MarkupSafe-3.0.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d98e66a24497637dd31ccab090b34392dddb1f2f811c4b4cd80c230205c074a3"}, 
+ {file = "MarkupSafe-3.0.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:ad91738f14eb8da0ff82f2acd0098b6257621410dcbd4df20aaa5b4233d75a50"}, + {file = "MarkupSafe-3.0.1-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:7044312a928a66a4c2a22644147bc61a199c1709712069a344a3fb5cfcf16915"}, + {file = "MarkupSafe-3.0.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:a4792d3b3a6dfafefdf8e937f14906a51bd27025a36f4b188728a73382231d91"}, + {file = "MarkupSafe-3.0.1-cp311-cp311-win32.whl", hash = "sha256:fa7d686ed9883f3d664d39d5a8e74d3c5f63e603c2e3ff0abcba23eac6542635"}, + {file = "MarkupSafe-3.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:9ba25a71ebf05b9bb0e2ae99f8bc08a07ee8e98c612175087112656ca0f5c8bf"}, + {file = "MarkupSafe-3.0.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:8ae369e84466aa70f3154ee23c1451fda10a8ee1b63923ce76667e3077f2b0c4"}, + {file = "MarkupSafe-3.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40f1e10d51c92859765522cbd79c5c8989f40f0419614bcdc5015e7b6bf97fc5"}, + {file = "MarkupSafe-3.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5a4cb365cb49b750bdb60b846b0c0bc49ed62e59a76635095a179d440540c346"}, + {file = "MarkupSafe-3.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ee3941769bd2522fe39222206f6dd97ae83c442a94c90f2b7a25d847d40f4729"}, + {file = "MarkupSafe-3.0.1-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62fada2c942702ef8952754abfc1a9f7658a4d5460fabe95ac7ec2cbe0d02abc"}, + {file = "MarkupSafe-3.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4c2d64fdba74ad16138300815cfdc6ab2f4647e23ced81f59e940d7d4a1469d9"}, + {file = "MarkupSafe-3.0.1-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:fb532dd9900381d2e8f48172ddc5a59db4c445a11b9fab40b3b786da40d3b56b"}, + {file = "MarkupSafe-3.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0f84af7e813784feb4d5e4ff7db633aba6c8ca64a833f61d8e4eade234ef0c38"}, + {file = "MarkupSafe-3.0.1-cp312-cp312-win32.whl", hash = "sha256:cbf445eb5628981a80f54087f9acdbf84f9b7d862756110d172993b9a5ae81aa"}, + {file = "MarkupSafe-3.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:a10860e00ded1dd0a65b83e717af28845bb7bd16d8ace40fe5531491de76b79f"}, + {file = "MarkupSafe-3.0.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:e81c52638315ff4ac1b533d427f50bc0afc746deb949210bc85f05d4f15fd772"}, + {file = "MarkupSafe-3.0.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:312387403cd40699ab91d50735ea7a507b788091c416dd007eac54434aee51da"}, + {file = "MarkupSafe-3.0.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ae99f31f47d849758a687102afdd05bd3d3ff7dbab0a8f1587981b58a76152a"}, + {file = "MarkupSafe-3.0.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c97ff7fedf56d86bae92fa0a646ce1a0ec7509a7578e1ed238731ba13aabcd1c"}, + {file = "MarkupSafe-3.0.1-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:a7420ceda262dbb4b8d839a4ec63d61c261e4e77677ed7c66c99f4e7cb5030dd"}, + {file = "MarkupSafe-3.0.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:45d42d132cff577c92bfba536aefcfea7e26efb975bd455db4e6602f5c9f45e7"}, + {file = "MarkupSafe-3.0.1-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:4c8817557d0de9349109acb38b9dd570b03cc5014e8aabf1cbddc6e81005becd"}, + {file = "MarkupSafe-3.0.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = 
"sha256:6a54c43d3ec4cf2a39f4387ad044221c66a376e58c0d0e971d47c475ba79c6b5"}, + {file = "MarkupSafe-3.0.1-cp313-cp313-win32.whl", hash = "sha256:c91b394f7601438ff79a4b93d16be92f216adb57d813a78be4446fe0f6bc2d8c"}, + {file = "MarkupSafe-3.0.1-cp313-cp313-win_amd64.whl", hash = "sha256:fe32482b37b4b00c7a52a07211b479653b7fe4f22b2e481b9a9b099d8a430f2f"}, + {file = "MarkupSafe-3.0.1-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:17b2aea42a7280db02ac644db1d634ad47dcc96faf38ab304fe26ba2680d359a"}, + {file = "MarkupSafe-3.0.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:852dc840f6d7c985603e60b5deaae1d89c56cb038b577f6b5b8c808c97580f1d"}, + {file = "MarkupSafe-3.0.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0778de17cff1acaeccc3ff30cd99a3fd5c50fc58ad3d6c0e0c4c58092b859396"}, + {file = "MarkupSafe-3.0.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:800100d45176652ded796134277ecb13640c1a537cad3b8b53da45aa96330453"}, + {file = "MarkupSafe-3.0.1-cp313-cp313t-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d06b24c686a34c86c8c1fba923181eae6b10565e4d80bdd7bc1c8e2f11247aa4"}, + {file = "MarkupSafe-3.0.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:33d1c36b90e570ba7785dacd1faaf091203d9942bc036118fab8110a401eb1a8"}, + {file = "MarkupSafe-3.0.1-cp313-cp313t-musllinux_1_2_i686.whl", hash = "sha256:beeebf760a9c1f4c07ef6a53465e8cfa776ea6a2021eda0d0417ec41043fe984"}, + {file = "MarkupSafe-3.0.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:bbde71a705f8e9e4c3e9e33db69341d040c827c7afa6789b14c6e16776074f5a"}, + {file = "MarkupSafe-3.0.1-cp313-cp313t-win32.whl", hash = "sha256:82b5dba6eb1bcc29cc305a18a3c5365d2af06ee71b123216416f7e20d2a84e5b"}, + {file = "MarkupSafe-3.0.1-cp313-cp313t-win_amd64.whl", hash = "sha256:730d86af59e0e43ce277bb83970530dd223bf7f2a838e086b50affa6ec5f9295"}, + {file = "MarkupSafe-3.0.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:4935dd7883f1d50e2ffecca0aa33dc1946a94c8f3fdafb8df5c330e48f71b132"}, + {file = "MarkupSafe-3.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:e9393357f19954248b00bed7c56f29a25c930593a77630c719653d51e7669c2a"}, + {file = "MarkupSafe-3.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:40621d60d0e58aa573b68ac5e2d6b20d44392878e0bfc159012a5787c4e35bc8"}, + {file = "MarkupSafe-3.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f94190df587738280d544971500b9cafc9b950d32efcb1fba9ac10d84e6aa4e6"}, + {file = "MarkupSafe-3.0.1-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b6a387d61fe41cdf7ea95b38e9af11cfb1a63499af2759444b99185c4ab33f5b"}, + {file = "MarkupSafe-3.0.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:8ad4ad1429cd4f315f32ef263c1342166695fad76c100c5d979c45d5570ed58b"}, + {file = "MarkupSafe-3.0.1-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:e24bfe89c6ac4c31792793ad9f861b8f6dc4546ac6dc8f1c9083c7c4f2b335cd"}, + {file = "MarkupSafe-3.0.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:2a4b34a8d14649315c4bc26bbfa352663eb51d146e35eef231dd739d54a5430a"}, + {file = "MarkupSafe-3.0.1-cp39-cp39-win32.whl", hash = "sha256:242d6860f1fd9191aef5fae22b51c5c19767f93fb9ead4d21924e0bcb17619d8"}, + {file = "MarkupSafe-3.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:93e8248d650e7e9d49e8251f883eed60ecbc0e8ffd6349e18550925e31bd029b"}, + {file = "markupsafe-3.0.1.tar.gz", hash = 
"sha256:3e683ee4f5d0fa2dde4db77ed8dd8a876686e3fc417655c2ece9a90576905344"}, +] + +[[package]] +name = "matplotlib-inline" +version = "0.1.7" +description = "Inline Matplotlib backend for Jupyter" +optional = false +python-versions = ">=3.8" +files = [ + {file = "matplotlib_inline-0.1.7-py3-none-any.whl", hash = "sha256:df192d39a4ff8f21b1895d72e6a13f5fcc5099f00fa84384e0ea28c2cc0653ca"}, + {file = "matplotlib_inline-0.1.7.tar.gz", hash = "sha256:8423b23ec666be3d16e16b60bdd8ac4e86e840ebd1dd11a30b9f117f2fa0ab90"}, +] + +[package.dependencies] +traitlets = "*" + +[[package]] +name = "mdit-py-plugins" +version = "0.4.2" +description = "Collection of plugins for markdown-it-py" +optional = false +python-versions = ">=3.8" +files = [ + {file = "mdit_py_plugins-0.4.2-py3-none-any.whl", hash = "sha256:0c673c3f889399a33b95e88d2f0d111b4447bdfea7f237dab2d488f459835636"}, + {file = "mdit_py_plugins-0.4.2.tar.gz", hash = "sha256:5f2cd1fdb606ddf152d37ec30e46101a60512bc0e5fa1a7002c36647b09e26b5"}, +] + +[package.dependencies] +markdown-it-py = ">=1.0.0,<4.0.0" + +[package.extras] +code-style = ["pre-commit"] +rtd = ["myst-parser", "sphinx-book-theme"] +testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] + +[[package]] +name = "mdurl" +version = "0.1.2" +description = "Markdown URL utilities" +optional = false +python-versions = ">=3.7" +files = [ + {file = "mdurl-0.1.2-py3-none-any.whl", hash = "sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8"}, + {file = "mdurl-0.1.2.tar.gz", hash = "sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba"}, +] + +[[package]] +name = "mergedeep" +version = "1.3.4" +description = "A deep merge function for 🐍." +optional = false +python-versions = ">=3.6" +files = [ + {file = "mergedeep-1.3.4-py3-none-any.whl", hash = "sha256:70775750742b25c0d8f36c55aed03d24c3384d17c951b3175d898bd778ef0307"}, + {file = "mergedeep-1.3.4.tar.gz", hash = "sha256:0096d52e9dad9939c3d975a774666af186eda617e6ca84df4c94dec30004f2a8"}, +] + +[[package]] +name = "mike" +version = "2.1.3" +description = "Manage multiple versions of your MkDocs-powered documentation" +optional = false +python-versions = "*" +files = [ + {file = "mike-2.1.3-py3-none-any.whl", hash = "sha256:d90c64077e84f06272437b464735130d380703a76a5738b152932884c60c062a"}, + {file = "mike-2.1.3.tar.gz", hash = "sha256:abd79b8ea483fb0275b7972825d3082e5ae67a41820f8d8a0dc7a3f49944e810"}, +] + +[package.dependencies] +importlib-metadata = "*" +importlib-resources = "*" +jinja2 = ">=2.7" +mkdocs = ">=1.0" +pyparsing = ">=3.0" +pyyaml = ">=5.1" +pyyaml-env-tag = "*" +verspec = "*" + +[package.extras] +dev = ["coverage", "flake8 (>=3.0)", "flake8-quotes", "shtab"] +test = ["coverage", "flake8 (>=3.0)", "flake8-quotes", "shtab"] + +[[package]] +name = "mistune" +version = "3.0.2" +description = "A sane and fast Markdown parser with useful plugins and renderers" +optional = false +python-versions = ">=3.7" +files = [ + {file = "mistune-3.0.2-py3-none-any.whl", hash = "sha256:71481854c30fdbc938963d3605b72501f5c10a9320ecd412c121c163a1c7d205"}, + {file = "mistune-3.0.2.tar.gz", hash = "sha256:fc7f93ded930c92394ef2cb6f04a8aabab4117a91449e72dcc8dfa646a508be8"}, +] + +[[package]] +name = "mkdocs" +version = "1.6.1" +description = "Project documentation with Markdown." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "mkdocs-1.6.1-py3-none-any.whl", hash = "sha256:db91759624d1647f3f34aa0c3f327dd2601beae39a366d6e064c03468d35c20e"}, + {file = "mkdocs-1.6.1.tar.gz", hash = "sha256:7b432f01d928c084353ab39c57282f29f92136665bdd6abf7c1ec8d822ef86f2"}, +] + +[package.dependencies] +click = ">=7.0" +colorama = {version = ">=0.4", markers = "platform_system == \"Windows\""} +ghp-import = ">=1.0" +jinja2 = ">=2.11.1" +markdown = ">=3.3.6" +markupsafe = ">=2.0.1" +mergedeep = ">=1.3.4" +mkdocs-get-deps = ">=0.2.0" +packaging = ">=20.5" +pathspec = ">=0.11.1" +pyyaml = ">=5.1" +pyyaml-env-tag = ">=0.1" +watchdog = ">=2.0" + +[package.extras] +i18n = ["babel (>=2.9.0)"] +min-versions = ["babel (==2.9.0)", "click (==7.0)", "colorama (==0.4)", "ghp-import (==1.0)", "importlib-metadata (==4.4)", "jinja2 (==2.11.1)", "markdown (==3.3.6)", "markupsafe (==2.0.1)", "mergedeep (==1.3.4)", "mkdocs-get-deps (==0.2.0)", "packaging (==20.5)", "pathspec (==0.11.1)", "pyyaml (==5.1)", "pyyaml-env-tag (==0.1)", "watchdog (==2.0)"] + +[[package]] +name = "mkdocs-autorefs" +version = "1.2.0" +description = "Automatically link across pages in MkDocs." +optional = false +python-versions = ">=3.8" +files = [ + {file = "mkdocs_autorefs-1.2.0-py3-none-any.whl", hash = "sha256:d588754ae89bd0ced0c70c06f58566a4ee43471eeeee5202427da7de9ef85a2f"}, + {file = "mkdocs_autorefs-1.2.0.tar.gz", hash = "sha256:a86b93abff653521bda71cf3fc5596342b7a23982093915cb74273f67522190f"}, +] + +[package.dependencies] +Markdown = ">=3.3" +markupsafe = ">=2.0.1" +mkdocs = ">=1.1" + +[[package]] +name = "mkdocs-awesome-pages-plugin" +version = "2.9.3" +description = "An MkDocs plugin that simplifies configuring page titles and their order" +optional = false +python-versions = ">=3.8.1" +files = [ + {file = "mkdocs_awesome_pages_plugin-2.9.3-py3-none-any.whl", hash = "sha256:1ba433d4e7edaf8661b15b93267f78f78e2e06ca590fc0e651ea36b191d64ae4"}, + {file = "mkdocs_awesome_pages_plugin-2.9.3.tar.gz", hash = "sha256:bdf6369871f41bb17f09c3cfb573367732dfcceb5673d7a2c5c76ac2567b242f"}, +] + +[package.dependencies] +mkdocs = ">=1" +natsort = ">=8.1.0" +wcmatch = ">=7" + +[[package]] +name = "mkdocs-breadcrumbs-plugin" +version = "0.1.10" +description = "Location-based breadcrumbs plugin for mkdocs." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "mkdocs_breadcrumbs_plugin-0.1.10-py3-none-any.whl", hash = "sha256:b3678f9e2acbe33f0720c001a4446fd545f0de3e58f8f3629e9f46a6f1c8d033"}, + {file = "mkdocs_breadcrumbs_plugin-0.1.10.tar.gz", hash = "sha256:36f902df21c6851e1c5108a588865d8098c58cc314d5a85ba2f3b8fb91e519bb"}, +] + +[package.dependencies] +mkdocs = ">=1.0.4" +mkdocs-material = "*" + +[[package]] +name = "mkdocs-get-deps" +version = "0.2.0" +description = "MkDocs extension that lists all dependencies according to a mkdocs.yml file" +optional = false +python-versions = ">=3.8" +files = [ + {file = "mkdocs_get_deps-0.2.0-py3-none-any.whl", hash = "sha256:2bf11d0b133e77a0dd036abeeb06dec8775e46efa526dc70667d8863eefc6134"}, + {file = "mkdocs_get_deps-0.2.0.tar.gz", hash = "sha256:162b3d129c7fad9b19abfdcb9c1458a651628e4b1dea628ac68790fb3061c60c"}, +] + +[package.dependencies] +mergedeep = ">=1.3.4" +platformdirs = ">=2.2.0" +pyyaml = ">=5.1" + +[[package]] +name = "mkdocs-git-authors-plugin" +version = "0.9.0" +description = "Mkdocs plugin to display git authors of a page" +optional = false +python-versions = ">=3.7" +files = [ + {file = "mkdocs_git_authors_plugin-0.9.0-py3-none-any.whl", hash = "sha256:380730a05eeb947a7e84be05fdb1c5ae2a7bc70fd9f6eda941f187c87ae37052"}, + {file = "mkdocs_git_authors_plugin-0.9.0.tar.gz", hash = "sha256:6161f63b87064481a48d9ad01c23e43c3e758930c3a9cc167fe482909ceb9eac"}, +] + +[package.dependencies] +mkdocs = ">=1.0" + +[[package]] +name = "mkdocs-git-committers-plugin-2" +version = "2.4.1" +description = "An MkDocs plugin to create a list of contributors on the page. The git-committers plugin will seed the template context with a list of GitHub or GitLab committers and other useful GIT info such as last modified date" +optional = false +python-versions = "<4,>=3.8" +files = [ + {file = "mkdocs_git_committers_plugin_2-2.4.1-py3-none-any.whl", hash = "sha256:ec9c1d81445606c471337d1c4a1782c643b7377077b545279dc18b86b7362c6d"}, + {file = "mkdocs_git_committers_plugin_2-2.4.1.tar.gz", hash = "sha256:ea1f80a79cedc42289e0b8e973276df04fb94f56e0ae3efc5385fb28547cf5cb"}, +] + +[package.dependencies] +gitpython = "*" +mkdocs = ">=1.0.3" +requests = "*" + +[[package]] +name = "mkdocs-git-revision-date-localized-plugin" +version = "1.2.9" +description = "Mkdocs plugin that enables displaying the localized date of the last git modification of a markdown file." +optional = false +python-versions = ">=3.8" +files = [ + {file = "mkdocs_git_revision_date_localized_plugin-1.2.9-py3-none-any.whl", hash = "sha256:dea5c8067c23df30275702a1708885500fadf0abfb595b60e698bffc79c7a423"}, + {file = "mkdocs_git_revision_date_localized_plugin-1.2.9.tar.gz", hash = "sha256:df9a50873fba3a42ce9123885f8c53d589e90ef6c2443fe3280ef1e8d33c8f65"}, +] + +[package.dependencies] +babel = ">=2.7.0" +GitPython = "*" +mkdocs = ">=1.0" +pytz = "*" + +[package.extras] +all = ["GitPython", "babel (>=2.7.0)", "click", "codecov", "mkdocs (>=1.0)", "mkdocs-gen-files", "mkdocs-git-authors-plugin", "mkdocs-material", "mkdocs-static-i18n", "pytest", "pytest-cov", "pytz"] +base = ["GitPython", "babel (>=2.7.0)", "mkdocs (>=1.0)", "pytz"] +dev = ["click", "codecov", "mkdocs-gen-files", "mkdocs-git-authors-plugin", "mkdocs-material", "mkdocs-static-i18n", "pytest", "pytest-cov"] + +[[package]] +name = "mkdocs-include-markdown-plugin" +version = "6.2.2" +description = "Mkdocs Markdown includer plugin." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "mkdocs_include_markdown_plugin-6.2.2-py3-none-any.whl", hash = "sha256:d293950f6499d2944291ca7b9bc4a60e652bbfd3e3a42b564f6cceee268694e7"}, + {file = "mkdocs_include_markdown_plugin-6.2.2.tar.gz", hash = "sha256:f2bd5026650492a581d2fd44be6c22f90391910d76582b96a34c264f2d17875d"}, +] + +[package.dependencies] +mkdocs = ">=1.4" +wcmatch = "*" + +[package.extras] +cache = ["platformdirs"] + +[[package]] +name = "mkdocs-jupyter" +version = "0.25.0" +description = "Use Jupyter in mkdocs websites" +optional = false +python-versions = ">=3.9" +files = [ + {file = "mkdocs_jupyter-0.25.0-py3-none-any.whl", hash = "sha256:d83d71deef19f0401505945bf92ec3bd5b40615af89308e72d5112929f8ee00b"}, + {file = "mkdocs_jupyter-0.25.0.tar.gz", hash = "sha256:e26c1d341916bc57f96ea3f93d8d0a88fc77c87d4cee222f66d2007798d924f5"}, +] + +[package.dependencies] +ipykernel = ">6.0.0,<7.0.0" +jupytext = ">1.13.8,<2" +mkdocs = ">=1.4.0,<2" +mkdocs-material = ">9.0.0" +nbconvert = ">=7.2.9,<8" +pygments = ">2.12.0" + +[[package]] +name = "mkdocs-macros-plugin" +version = "1.3.5" +description = "Unleash the power of MkDocs with macros and variables" +optional = false +python-versions = ">=3.8" +files = [ + {file = "mkdocs-macros-plugin-1.3.5.tar.gz", hash = "sha256:5fd6969e2c43e23031ffb719bebe7421163ea26f4dc360af2343144ca979b04b"}, + {file = "mkdocs_macros_plugin-1.3.5-py3-none-any.whl", hash = "sha256:58bd47ea7097d1a2824dc9d0d912c211823c5e6e6fe8a19a3ecf33346f7d6547"}, +] + +[package.dependencies] +hjson = "*" +jinja2 = "*" +mkdocs = ">=0.17" +packaging = "*" +pathspec = "*" +python-dateutil = "*" +pyyaml = "*" +super-collections = "*" +termcolor = "*" + +[package.extras] +test = ["mkdocs-include-markdown-plugin", "mkdocs-macros-test", "mkdocs-material (>=6.2)", "mkdocs-test"] + +[[package]] +name = "mkdocs-material" +version = "9.5.40" +description = "Documentation that simply works" +optional = false +python-versions = ">=3.8" +files = [ + {file = "mkdocs_material-9.5.40-py3-none-any.whl", hash = "sha256:8e7a16ada34e79a7b6459ff2602584222f522c738b6a023d1bea853d5049da6f"}, + {file = "mkdocs_material-9.5.40.tar.gz", hash = "sha256:b69d70e667ec51fc41f65e006a3184dd00d95b2439d982cb1586e4c018943156"}, +] + +[package.dependencies] +babel = ">=2.10,<3.0" +cairosvg = {version = ">=2.6,<3.0", optional = true, markers = "extra == \"imaging\""} +colorama = ">=0.4,<1.0" +jinja2 = ">=3.0,<4.0" +markdown = ">=3.2,<4.0" +mkdocs = ">=1.6,<2.0" +mkdocs-material-extensions = ">=1.3,<2.0" +paginate = ">=0.5,<1.0" +pillow = {version = ">=10.2,<11.0", optional = true, markers = "extra == \"imaging\""} +pygments = ">=2.16,<3.0" +pymdown-extensions = ">=10.2,<11.0" +regex = ">=2022.4" +requests = ">=2.26,<3.0" + +[package.extras] +git = ["mkdocs-git-committers-plugin-2 (>=1.1,<2.0)", "mkdocs-git-revision-date-localized-plugin (>=1.2.4,<2.0)"] +imaging = ["cairosvg (>=2.6,<3.0)", "pillow (>=10.2,<11.0)"] +recommended = ["mkdocs-minify-plugin (>=0.7,<1.0)", "mkdocs-redirects (>=1.2,<2.0)", "mkdocs-rss-plugin (>=1.6,<2.0)"] + +[[package]] +name = "mkdocs-material-extensions" +version = "1.3.1" +description = "Extension pack for Python Markdown and MkDocs Material." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "mkdocs_material_extensions-1.3.1-py3-none-any.whl", hash = "sha256:adff8b62700b25cb77b53358dad940f3ef973dd6db797907c49e3c2ef3ab4e31"}, + {file = "mkdocs_material_extensions-1.3.1.tar.gz", hash = "sha256:10c9511cea88f568257f960358a467d12b970e1f7b2c0e5fb2bb48cab1928443"}, +] + +[[package]] +name = "mkdocs-mermaid2-plugin" +version = "1.1.1" +description = "A MkDocs plugin for including mermaid graphs in markdown sources" +optional = false +python-versions = ">=3.6" +files = [ + {file = "mkdocs-mermaid2-plugin-1.1.1.tar.gz", hash = "sha256:bea5f3cbe6cb76bad21b81e49a01e074427ed466666c5d404e62fe8698bc2d7c"}, + {file = "mkdocs_mermaid2_plugin-1.1.1-py3-none-any.whl", hash = "sha256:4e25876b59d1e151ca33a467207b346404b4a246f4f24af5e44c32408e175882"}, +] + +[package.dependencies] +beautifulsoup4 = ">=4.6.3" +jsbeautifier = "*" +mkdocs = ">=1.0.4" +pymdown-extensions = ">=8.0" +requests = "*" +setuptools = ">=18.5" + +[package.extras] +test = ["mkdocs-material"] + +[[package]] +name = "mkdocs-minify-plugin" +version = "0.8.0" +description = "An MkDocs plugin to minify HTML, JS or CSS files prior to being written to disk" +optional = false +python-versions = ">=3.8" +files = [ + {file = "mkdocs-minify-plugin-0.8.0.tar.gz", hash = "sha256:bc11b78b8120d79e817308e2b11539d790d21445eb63df831e393f76e52e753d"}, + {file = "mkdocs_minify_plugin-0.8.0-py3-none-any.whl", hash = "sha256:5fba1a3f7bd9a2142c9954a6559a57e946587b21f133165ece30ea145c66aee6"}, +] + +[package.dependencies] +csscompressor = ">=0.9.5" +htmlmin2 = ">=0.1.13" +jsmin = ">=3.0.1" +mkdocs = ">=1.4.1" + +[[package]] +name = "mkdocs-open-in-new-tab" +version = "1.0.6" +description = "MkDocs plugin to open outgoing links and PDFs in new tab." +optional = false +python-versions = ">=3.7" +files = [ + {file = "mkdocs_open_in_new_tab-1.0.6-py3-none-any.whl", hash = "sha256:c188d311b882567dd300b629ef7aa0d7835b4781216ab147a9111bf686ac9221"}, + {file = "mkdocs_open_in_new_tab-1.0.6.tar.gz", hash = "sha256:dd4389b04cc9029697e2398a3ddf1c47ff2ee7f4307112f691cf98ccf148d185"}, +] + +[package.dependencies] +mkdocs = "*" + +[package.extras] +dev = ["build (>=1.2.2,<1.3.0)", "mkdocs-git-revision-date-localized-plugin (>=1.2.6,<1.3.0)", "mkdocs-glightbox (>=0.4.0,<0.5.0)", "mkdocs-material (>=9.5.27,<9.6.0)", "setuptools (>=70.0.0,<70.1.0)", "twine (>=5.1.1,<5.2.0)"] + +[[package]] +name = "mkdocs-redirects" +version = "1.2.1" +description = "A MkDocs plugin for dynamic page redirects to prevent broken links." +optional = false +python-versions = ">=3.6" +files = [ + {file = "mkdocs-redirects-1.2.1.tar.gz", hash = "sha256:9420066d70e2a6bb357adf86e67023dcdca1857f97f07c7fe450f8f1fb42f861"}, + {file = "mkdocs_redirects-1.2.1-py3-none-any.whl", hash = "sha256:497089f9e0219e7389304cffefccdfa1cac5ff9509f2cb706f4c9b221726dffb"}, +] + +[package.dependencies] +mkdocs = ">=1.1.1" + +[package.extras] +dev = ["autoflake", "black", "isort", "pytest", "twine (>=1.13.0)"] +release = ["twine (>=1.13.0)"] +test = ["autoflake", "black", "isort", "pytest"] + +[[package]] +name = "mkdocstrings" +version = "0.26.2" +description = "Automatic documentation from sources, for MkDocs." 
+optional = false +python-versions = ">=3.9" +files = [ + {file = "mkdocstrings-0.26.2-py3-none-any.whl", hash = "sha256:1248f3228464f3b8d1a15bd91249ce1701fe3104ac517a5f167a0e01ca850ba5"}, + {file = "mkdocstrings-0.26.2.tar.gz", hash = "sha256:34a8b50f1e6cfd29546c6c09fbe02154adfb0b361bb758834bf56aa284ba876e"}, +] + +[package.dependencies] +click = ">=7.0" +Jinja2 = ">=2.11.1" +Markdown = ">=3.6" +MarkupSafe = ">=1.1" +mkdocs = ">=1.4" +mkdocs-autorefs = ">=1.2" +mkdocstrings-python = {version = ">=0.5.2", optional = true, markers = "extra == \"python\""} +platformdirs = ">=2.2" +pymdown-extensions = ">=6.3" + +[package.extras] +crystal = ["mkdocstrings-crystal (>=0.3.4)"] +python = ["mkdocstrings-python (>=0.5.2)"] +python-legacy = ["mkdocstrings-python-legacy (>=0.2.1)"] + +[[package]] +name = "mkdocstrings-python" +version = "1.12.1" +description = "A Python handler for mkdocstrings." +optional = false +python-versions = ">=3.9" +files = [ + {file = "mkdocstrings_python-1.12.1-py3-none-any.whl", hash = "sha256:205244488199c9aa2a39787ad6a0c862d39b74078ea9aa2be817bc972399563f"}, + {file = "mkdocstrings_python-1.12.1.tar.gz", hash = "sha256:60d6a5ca912c9af4ad431db6d0111ce9f79c6c48d33377dde6a05a8f5f48d792"}, +] + +[package.dependencies] +griffe = ">=0.49" +mkdocs-autorefs = ">=1.2" +mkdocstrings = ">=0.26" + +[[package]] +name = "natsort" +version = "8.4.0" +description = "Simple yet flexible natural sorting in Python." +optional = false +python-versions = ">=3.7" +files = [ + {file = "natsort-8.4.0-py3-none-any.whl", hash = "sha256:4732914fb471f56b5cce04d7bae6f164a592c7712e1c85f9ef585e197299521c"}, + {file = "natsort-8.4.0.tar.gz", hash = "sha256:45312c4a0e5507593da193dedd04abb1469253b601ecaf63445ad80f0a1ea581"}, +] + +[package.extras] +fast = ["fastnumbers (>=2.0.0)"] +icu = ["PyICU (>=1.0.0)"] + +[[package]] +name = "nbclient" +version = "0.10.0" +description = "A client library for executing notebooks. Formerly nbconvert's ExecutePreprocessor." +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "nbclient-0.10.0-py3-none-any.whl", hash = "sha256:f13e3529332a1f1f81d82a53210322476a168bb7090a0289c795fe9cc11c9d3f"}, + {file = "nbclient-0.10.0.tar.gz", hash = "sha256:4b3f1b7dba531e498449c4db4f53da339c91d449dc11e9af3a43b4eb5c5abb09"}, +] + +[package.dependencies] +jupyter-client = ">=6.1.12" +jupyter-core = ">=4.12,<5.0.dev0 || >=5.1.dev0" +nbformat = ">=5.1" +traitlets = ">=5.4" + +[package.extras] +dev = ["pre-commit"] +docs = ["autodoc-traits", "mock", "moto", "myst-parser", "nbclient[test]", "sphinx (>=1.7)", "sphinx-book-theme", "sphinxcontrib-spelling"] +test = ["flaky", "ipykernel (>=6.19.3)", "ipython", "ipywidgets", "nbconvert (>=7.0.0)", "pytest (>=7.0,<8)", "pytest-asyncio", "pytest-cov (>=4.0)", "testpath", "xmltodict"] + +[[package]] +name = "nbconvert" +version = "7.16.4" +description = "Converting Jupyter Notebooks (.ipynb files) to other formats. Output formats include asciidoc, html, latex, markdown, pdf, py, rst, script. nbconvert can be used both as a Python library (`import nbconvert`) or as a command line tool (invoked as `jupyter nbconvert ...`)." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "nbconvert-7.16.4-py3-none-any.whl", hash = "sha256:05873c620fe520b6322bf8a5ad562692343fe3452abda5765c7a34b7d1aa3eb3"}, + {file = "nbconvert-7.16.4.tar.gz", hash = "sha256:86ca91ba266b0a448dc96fa6c5b9d98affabde2867b363258703536807f9f7f4"}, +] + +[package.dependencies] +beautifulsoup4 = "*" +bleach = "!=5.0.0" +defusedxml = "*" +jinja2 = ">=3.0" +jupyter-core = ">=4.7" +jupyterlab-pygments = "*" +markupsafe = ">=2.0" +mistune = ">=2.0.3,<4" +nbclient = ">=0.5.0" +nbformat = ">=5.7" +packaging = "*" +pandocfilters = ">=1.4.1" +pygments = ">=2.4.1" +tinycss2 = "*" +traitlets = ">=5.1" + +[package.extras] +all = ["flaky", "ipykernel", "ipython", "ipywidgets (>=7.5)", "myst-parser", "nbsphinx (>=0.2.12)", "playwright", "pydata-sphinx-theme", "pyqtwebengine (>=5.15)", "pytest (>=7)", "sphinx (==5.0.2)", "sphinxcontrib-spelling", "tornado (>=6.1)"] +docs = ["ipykernel", "ipython", "myst-parser", "nbsphinx (>=0.2.12)", "pydata-sphinx-theme", "sphinx (==5.0.2)", "sphinxcontrib-spelling"] +qtpdf = ["pyqtwebengine (>=5.15)"] +qtpng = ["pyqtwebengine (>=5.15)"] +serve = ["tornado (>=6.1)"] +test = ["flaky", "ipykernel", "ipywidgets (>=7.5)", "pytest (>=7)"] +webpdf = ["playwright"] + +[[package]] +name = "nbformat" +version = "5.10.4" +description = "The Jupyter Notebook format" +optional = false +python-versions = ">=3.8" +files = [ + {file = "nbformat-5.10.4-py3-none-any.whl", hash = "sha256:3b48d6c8fbca4b299bf3982ea7db1af21580e4fec269ad087b9e81588891200b"}, + {file = "nbformat-5.10.4.tar.gz", hash = "sha256:322168b14f937a5d11362988ecac2a4952d3d8e3a2cbeb2319584631226d5b3a"}, +] + +[package.dependencies] +fastjsonschema = ">=2.15" +jsonschema = ">=2.6" +jupyter-core = ">=4.12,<5.0.dev0 || >=5.1.dev0" +traitlets = ">=5.1" + +[package.extras] +docs = ["myst-parser", "pydata-sphinx-theme", "sphinx", "sphinxcontrib-github-alt", "sphinxcontrib-spelling"] +test = ["pep440", "pre-commit", "pytest", "testpath"] + +[[package]] +name = "nest-asyncio" +version = "1.6.0" +description = "Patch asyncio to allow nested event loops" +optional = false +python-versions = ">=3.5" +files = [ + {file = "nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c"}, + {file = "nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe"}, +] + +[[package]] +name = "packaging" +version = "24.1" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "packaging-24.1-py3-none-any.whl", hash = "sha256:5b8f2217dbdbd2f7f384c41c628544e6d52f2d0f53c6d0c3ea61aa5d1d7ff124"}, + {file = "packaging-24.1.tar.gz", hash = "sha256:026ed72c8ed3fcce5bf8950572258698927fd1dbda10a5e981cdf0ac37f4f002"}, +] + +[[package]] +name = "paginate" +version = "0.5.7" +description = "Divides large result sets into pages for easier browsing" +optional = false +python-versions = "*" +files = [ + {file = "paginate-0.5.7-py2.py3-none-any.whl", hash = "sha256:b885e2af73abcf01d9559fd5216b57ef722f8c42affbb63942377668e35c7591"}, + {file = "paginate-0.5.7.tar.gz", hash = "sha256:22bd083ab41e1a8b4f3690544afb2c60c25e5c9a63a30fa2f483f6c60c8e5945"}, +] + +[package.extras] +dev = ["pytest", "tox"] +lint = ["black"] + +[[package]] +name = "pandocfilters" +version = "1.5.1" +description = "Utilities for writing pandoc filters in python" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" +files = [ + {file = 
"pandocfilters-1.5.1-py2.py3-none-any.whl", hash = "sha256:93be382804a9cdb0a7267585f157e5d1731bbe5545a85b268d6f5fe6232de2bc"}, + {file = "pandocfilters-1.5.1.tar.gz", hash = "sha256:002b4a555ee4ebc03f8b66307e287fa492e4a77b4ea14d3f934328297bb4939e"}, +] + +[[package]] +name = "parso" +version = "0.8.4" +description = "A Python Parser" +optional = false +python-versions = ">=3.6" +files = [ + {file = "parso-0.8.4-py2.py3-none-any.whl", hash = "sha256:a418670a20291dacd2dddc80c377c5c3791378ee1e8d12bffc35420643d43f18"}, + {file = "parso-0.8.4.tar.gz", hash = "sha256:eb3a7b58240fb99099a345571deecc0f9540ea5f4dd2fe14c2a99d6b281ab92d"}, +] + +[package.extras] +qa = ["flake8 (==5.0.4)", "mypy (==0.971)", "types-setuptools (==67.2.0.1)"] +testing = ["docopt", "pytest"] + +[[package]] +name = "pathspec" +version = "0.12.1" +description = "Utility library for gitignore style pattern matching of file paths." +optional = false +python-versions = ">=3.8" +files = [ + {file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"}, + {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"}, +] + +[[package]] +name = "pexpect" +version = "4.9.0" +description = "Pexpect allows easy control of interactive console applications." +optional = false +python-versions = "*" +files = [ + {file = "pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523"}, + {file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"}, +] + +[package.dependencies] +ptyprocess = ">=0.5" + +[[package]] +name = "pillow" +version = "10.4.0" +description = "Python Imaging Library (Fork)" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pillow-10.4.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:4d9667937cfa347525b319ae34375c37b9ee6b525440f3ef48542fcf66f2731e"}, + {file = "pillow-10.4.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:543f3dc61c18dafb755773efc89aae60d06b6596a63914107f75459cf984164d"}, + {file = "pillow-10.4.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7928ecbf1ece13956b95d9cbcfc77137652b02763ba384d9ab508099a2eca856"}, + {file = "pillow-10.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e4d49b85c4348ea0b31ea63bc75a9f3857869174e2bf17e7aba02945cd218e6f"}, + {file = "pillow-10.4.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:6c762a5b0997f5659a5ef2266abc1d8851ad7749ad9a6a5506eb23d314e4f46b"}, + {file = "pillow-10.4.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:a985e028fc183bf12a77a8bbf36318db4238a3ded7fa9df1b9a133f1cb79f8fc"}, + {file = "pillow-10.4.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:812f7342b0eee081eaec84d91423d1b4650bb9828eb53d8511bcef8ce5aecf1e"}, + {file = "pillow-10.4.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ac1452d2fbe4978c2eec89fb5a23b8387aba707ac72810d9490118817d9c0b46"}, + {file = "pillow-10.4.0-cp310-cp310-win32.whl", hash = "sha256:bcd5e41a859bf2e84fdc42f4edb7d9aba0a13d29a2abadccafad99de3feff984"}, + {file = "pillow-10.4.0-cp310-cp310-win_amd64.whl", hash = "sha256:ecd85a8d3e79cd7158dec1c9e5808e821feea088e2f69a974db5edf84dc53141"}, + {file = "pillow-10.4.0-cp310-cp310-win_arm64.whl", hash = "sha256:ff337c552345e95702c5fde3158acb0625111017d0e5f24bf3acdb9cc16b90d1"}, + {file = "pillow-10.4.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = 
"sha256:0a9ec697746f268507404647e531e92889890a087e03681a3606d9b920fbee3c"}, + {file = "pillow-10.4.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:dfe91cb65544a1321e631e696759491ae04a2ea11d36715eca01ce07284738be"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5dc6761a6efc781e6a1544206f22c80c3af4c8cf461206d46a1e6006e4429ff3"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e84b6cc6a4a3d76c153a6b19270b3526a5a8ed6b09501d3af891daa2a9de7d6"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:bbc527b519bd3aa9d7f429d152fea69f9ad37c95f0b02aebddff592688998abe"}, + {file = "pillow-10.4.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:76a911dfe51a36041f2e756b00f96ed84677cdeb75d25c767f296c1c1eda1319"}, + {file = "pillow-10.4.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:59291fb29317122398786c2d44427bbd1a6d7ff54017075b22be9d21aa59bd8d"}, + {file = "pillow-10.4.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:416d3a5d0e8cfe4f27f574362435bc9bae57f679a7158e0096ad2beb427b8696"}, + {file = "pillow-10.4.0-cp311-cp311-win32.whl", hash = "sha256:7086cc1d5eebb91ad24ded9f58bec6c688e9f0ed7eb3dbbf1e4800280a896496"}, + {file = "pillow-10.4.0-cp311-cp311-win_amd64.whl", hash = "sha256:cbed61494057c0f83b83eb3a310f0bf774b09513307c434d4366ed64f4128a91"}, + {file = "pillow-10.4.0-cp311-cp311-win_arm64.whl", hash = "sha256:f5f0c3e969c8f12dd2bb7e0b15d5c468b51e5017e01e2e867335c81903046a22"}, + {file = "pillow-10.4.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:673655af3eadf4df6b5457033f086e90299fdd7a47983a13827acf7459c15d94"}, + {file = "pillow-10.4.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:866b6942a92f56300012f5fbac71f2d610312ee65e22f1aa2609e491284e5597"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:29dbdc4207642ea6aad70fbde1a9338753d33fb23ed6956e706936706f52dd80"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bf2342ac639c4cf38799a44950bbc2dfcb685f052b9e262f446482afaf4bffca"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:f5b92f4d70791b4a67157321c4e8225d60b119c5cc9aee8ecf153aace4aad4ef"}, + {file = "pillow-10.4.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:86dcb5a1eb778d8b25659d5e4341269e8590ad6b4e8b44d9f4b07f8d136c414a"}, + {file = "pillow-10.4.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:780c072c2e11c9b2c7ca37f9a2ee8ba66f44367ac3e5c7832afcfe5104fd6d1b"}, + {file = "pillow-10.4.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:37fb69d905be665f68f28a8bba3c6d3223c8efe1edf14cc4cfa06c241f8c81d9"}, + {file = "pillow-10.4.0-cp312-cp312-win32.whl", hash = "sha256:7dfecdbad5c301d7b5bde160150b4db4c659cee2b69589705b6f8a0c509d9f42"}, + {file = "pillow-10.4.0-cp312-cp312-win_amd64.whl", hash = "sha256:1d846aea995ad352d4bdcc847535bd56e0fd88d36829d2c90be880ef1ee4668a"}, + {file = "pillow-10.4.0-cp312-cp312-win_arm64.whl", hash = "sha256:e553cad5179a66ba15bb18b353a19020e73a7921296a7979c4a2b7f6a5cd57f9"}, + {file = "pillow-10.4.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8bc1a764ed8c957a2e9cacf97c8b2b053b70307cf2996aafd70e91a082e70df3"}, + {file = "pillow-10.4.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:6209bb41dc692ddfee4942517c19ee81b86c864b626dbfca272ec0f7cff5d9fb"}, + {file = 
"pillow-10.4.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bee197b30783295d2eb680b311af15a20a8b24024a19c3a26431ff83eb8d1f70"}, + {file = "pillow-10.4.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1ef61f5dd14c300786318482456481463b9d6b91ebe5ef12f405afbba77ed0be"}, + {file = "pillow-10.4.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:297e388da6e248c98bc4a02e018966af0c5f92dfacf5a5ca22fa01cb3179bca0"}, + {file = "pillow-10.4.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:e4db64794ccdf6cb83a59d73405f63adbe2a1887012e308828596100a0b2f6cc"}, + {file = "pillow-10.4.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bd2880a07482090a3bcb01f4265f1936a903d70bc740bfcb1fd4e8a2ffe5cf5a"}, + {file = "pillow-10.4.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4b35b21b819ac1dbd1233317adeecd63495f6babf21b7b2512d244ff6c6ce309"}, + {file = "pillow-10.4.0-cp313-cp313-win32.whl", hash = "sha256:551d3fd6e9dc15e4c1eb6fc4ba2b39c0c7933fa113b220057a34f4bb3268a060"}, + {file = "pillow-10.4.0-cp313-cp313-win_amd64.whl", hash = "sha256:030abdbe43ee02e0de642aee345efa443740aa4d828bfe8e2eb11922ea6a21ea"}, + {file = "pillow-10.4.0-cp313-cp313-win_arm64.whl", hash = "sha256:5b001114dd152cfd6b23befeb28d7aee43553e2402c9f159807bf55f33af8a8d"}, + {file = "pillow-10.4.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:8d4d5063501b6dd4024b8ac2f04962d661222d120381272deea52e3fc52d3736"}, + {file = "pillow-10.4.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7c1ee6f42250df403c5f103cbd2768a28fe1a0ea1f0f03fe151c8741e1469c8b"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b15e02e9bb4c21e39876698abf233c8c579127986f8207200bc8a8f6bb27acf2"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7a8d4bade9952ea9a77d0c3e49cbd8b2890a399422258a77f357b9cc9be8d680"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:43efea75eb06b95d1631cb784aa40156177bf9dd5b4b03ff38979e048258bc6b"}, + {file = "pillow-10.4.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:950be4d8ba92aca4b2bb0741285a46bfae3ca699ef913ec8416c1b78eadd64cd"}, + {file = "pillow-10.4.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:d7480af14364494365e89d6fddc510a13e5a2c3584cb19ef65415ca57252fb84"}, + {file = "pillow-10.4.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:73664fe514b34c8f02452ffb73b7a92c6774e39a647087f83d67f010eb9a0cf0"}, + {file = "pillow-10.4.0-cp38-cp38-win32.whl", hash = "sha256:e88d5e6ad0d026fba7bdab8c3f225a69f063f116462c49892b0149e21b6c0a0e"}, + {file = "pillow-10.4.0-cp38-cp38-win_amd64.whl", hash = "sha256:5161eef006d335e46895297f642341111945e2c1c899eb406882a6c61a4357ab"}, + {file = "pillow-10.4.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:0ae24a547e8b711ccaaf99c9ae3cd975470e1a30caa80a6aaee9a2f19c05701d"}, + {file = "pillow-10.4.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:298478fe4f77a4408895605f3482b6cc6222c018b2ce565c2b6b9c354ac3229b"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:134ace6dc392116566980ee7436477d844520a26a4b1bd4053f6f47d096997fd"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:930044bb7679ab003b14023138b50181899da3f25de50e9dbee23b61b4de2126"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = 
"sha256:c76e5786951e72ed3686e122d14c5d7012f16c8303a674d18cdcd6d89557fc5b"}, + {file = "pillow-10.4.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:b2724fdb354a868ddf9a880cb84d102da914e99119211ef7ecbdc613b8c96b3c"}, + {file = "pillow-10.4.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:dbc6ae66518ab3c5847659e9988c3b60dc94ffb48ef9168656e0019a93dbf8a1"}, + {file = "pillow-10.4.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:06b2f7898047ae93fad74467ec3d28fe84f7831370e3c258afa533f81ef7f3df"}, + {file = "pillow-10.4.0-cp39-cp39-win32.whl", hash = "sha256:7970285ab628a3779aecc35823296a7869f889b8329c16ad5a71e4901a3dc4ef"}, + {file = "pillow-10.4.0-cp39-cp39-win_amd64.whl", hash = "sha256:961a7293b2457b405967af9c77dcaa43cc1a8cd50d23c532e62d48ab6cdd56f5"}, + {file = "pillow-10.4.0-cp39-cp39-win_arm64.whl", hash = "sha256:32cda9e3d601a52baccb2856b8ea1fc213c90b340c542dcef77140dfa3278a9e"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5b4815f2e65b30f5fbae9dfffa8636d992d49705723fe86a3661806e069352d4"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:8f0aef4ef59694b12cadee839e2ba6afeab89c0f39a3adc02ed51d109117b8da"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9f4727572e2918acaa9077c919cbbeb73bd2b3ebcfe033b72f858fc9fbef0026"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ff25afb18123cea58a591ea0244b92eb1e61a1fd497bf6d6384f09bc3262ec3e"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:dc3e2db6ba09ffd7d02ae9141cfa0ae23393ee7687248d46a7507b75d610f4f5"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:02a2be69f9c9b8c1e97cf2713e789d4e398c751ecfd9967c18d0ce304efbf885"}, + {file = "pillow-10.4.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:0755ffd4a0c6f267cccbae2e9903d95477ca2f77c4fcf3a3a09570001856c8a5"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:a02364621fe369e06200d4a16558e056fe2805d3468350df3aef21e00d26214b"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:1b5dea9831a90e9d0721ec417a80d4cbd7022093ac38a568db2dd78363b00908"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9b885f89040bb8c4a1573566bbb2f44f5c505ef6e74cec7ab9068c900047f04b"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87dd88ded2e6d74d31e1e0a99a726a6765cda32d00ba72dc37f0651f306daaa8"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:2db98790afc70118bd0255c2eeb465e9767ecf1f3c25f9a1abb8ffc8cfd1fe0a"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:f7baece4ce06bade126fb84b8af1c33439a76d8a6fd818970215e0560ca28c27"}, + {file = "pillow-10.4.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:cfdd747216947628af7b259d274771d84db2268ca062dd5faf373639d00113a3"}, + {file = "pillow-10.4.0.tar.gz", hash = "sha256:166c1cd4d24309b30d61f79f4a9114b7b2313d7450912277855ff5dfd7cd4a06"}, +] + +[package.extras] +docs = ["furo", "olefile", "sphinx (>=7.3)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinxext-opengraph"] +fpx = ["olefile"] +mic = ["olefile"] +tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] 
+typing = ["typing-extensions"] +xmp = ["defusedxml"] + +[[package]] +name = "platformdirs" +version = "4.3.6" +description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." +optional = false +python-versions = ">=3.8" +files = [ + {file = "platformdirs-4.3.6-py3-none-any.whl", hash = "sha256:73e575e1408ab8103900836b97580d5307456908a03e92031bab39e4554cc3fb"}, + {file = "platformdirs-4.3.6.tar.gz", hash = "sha256:357fb2acbc885b0419afd3ce3ed34564c13c9b95c89360cd9563f73aa5e2b907"}, +] + +[package.extras] +docs = ["furo (>=2024.8.6)", "proselint (>=0.14)", "sphinx (>=8.0.2)", "sphinx-autodoc-typehints (>=2.4)"] +test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=8.3.2)", "pytest-cov (>=5)", "pytest-mock (>=3.14)"] +type = ["mypy (>=1.11.2)"] + +[[package]] +name = "prompt-toolkit" +version = "3.0.48" +description = "Library for building powerful interactive command lines in Python" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "prompt_toolkit-3.0.48-py3-none-any.whl", hash = "sha256:f49a827f90062e411f1ce1f854f2aedb3c23353244f8108b89283587397ac10e"}, + {file = "prompt_toolkit-3.0.48.tar.gz", hash = "sha256:d6623ab0477a80df74e646bdbc93621143f5caf104206aa29294d53de1a03d90"}, +] + +[package.dependencies] +wcwidth = "*" + +[[package]] +name = "psutil" +version = "6.0.0" +description = "Cross-platform lib for process and system monitoring in Python." +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,>=2.7" +files = [ + {file = "psutil-6.0.0-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:a021da3e881cd935e64a3d0a20983bda0bb4cf80e4f74fa9bfcb1bc5785360c6"}, + {file = "psutil-6.0.0-cp27-cp27m-manylinux2010_i686.whl", hash = "sha256:1287c2b95f1c0a364d23bc6f2ea2365a8d4d9b726a3be7294296ff7ba97c17f0"}, + {file = "psutil-6.0.0-cp27-cp27m-manylinux2010_x86_64.whl", hash = "sha256:a9a3dbfb4de4f18174528d87cc352d1f788b7496991cca33c6996f40c9e3c92c"}, + {file = "psutil-6.0.0-cp27-cp27mu-manylinux2010_i686.whl", hash = "sha256:6ec7588fb3ddaec7344a825afe298db83fe01bfaaab39155fa84cf1c0d6b13c3"}, + {file = "psutil-6.0.0-cp27-cp27mu-manylinux2010_x86_64.whl", hash = "sha256:1e7c870afcb7d91fdea2b37c24aeb08f98b6d67257a5cb0a8bc3ac68d0f1a68c"}, + {file = "psutil-6.0.0-cp27-none-win32.whl", hash = "sha256:02b69001f44cc73c1c5279d02b30a817e339ceb258ad75997325e0e6169d8b35"}, + {file = "psutil-6.0.0-cp27-none-win_amd64.whl", hash = "sha256:21f1fb635deccd510f69f485b87433460a603919b45e2a324ad65b0cc74f8fb1"}, + {file = "psutil-6.0.0-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:c588a7e9b1173b6e866756dde596fd4cad94f9399daf99ad8c3258b3cb2b47a0"}, + {file = "psutil-6.0.0-cp36-abi3-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ed2440ada7ef7d0d608f20ad89a04ec47d2d3ab7190896cd62ca5fc4fe08bf0"}, + {file = "psutil-6.0.0-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fd9a97c8e94059b0ef54a7d4baf13b405011176c3b6ff257c247cae0d560ecd"}, + {file = "psutil-6.0.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e2e8d0054fc88153ca0544f5c4d554d42e33df2e009c4ff42284ac9ebdef4132"}, + {file = "psutil-6.0.0-cp36-cp36m-win32.whl", hash = "sha256:fc8c9510cde0146432bbdb433322861ee8c3efbf8589865c8bf8d21cb30c4d14"}, + {file = "psutil-6.0.0-cp36-cp36m-win_amd64.whl", hash = "sha256:34859b8d8f423b86e4385ff3665d3f4d94be3cdf48221fbe476e883514fdb71c"}, + {file = 
"psutil-6.0.0-cp37-abi3-win32.whl", hash = "sha256:a495580d6bae27291324fe60cea0b5a7c23fa36a7cd35035a16d93bdcf076b9d"}, + {file = "psutil-6.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:33ea5e1c975250a720b3a6609c490db40dae5d83a4eb315170c4fe0d8b1f34b3"}, + {file = "psutil-6.0.0-cp38-abi3-macosx_11_0_arm64.whl", hash = "sha256:ffe7fc9b6b36beadc8c322f84e1caff51e8703b88eee1da46d1e3a6ae11b4fd0"}, + {file = "psutil-6.0.0.tar.gz", hash = "sha256:8faae4f310b6d969fa26ca0545338b21f73c6b15db7c4a8d934a5482faa818f2"}, +] + +[package.extras] +test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] + +[[package]] +name = "ptyprocess" +version = "0.7.0" +description = "Run a subprocess in a pseudo terminal" +optional = false +python-versions = "*" +files = [ + {file = "ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35"}, + {file = "ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220"}, +] + +[[package]] +name = "pure-eval" +version = "0.2.3" +description = "Safely evaluate AST nodes without side effects" +optional = false +python-versions = "*" +files = [ + {file = "pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0"}, + {file = "pure_eval-0.2.3.tar.gz", hash = "sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42"}, +] + +[package.extras] +tests = ["pytest"] + +[[package]] +name = "pycparser" +version = "2.22" +description = "C parser in Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pycparser-2.22-py3-none-any.whl", hash = "sha256:c3702b6d3dd8c7abc1afa565d7e63d53a1d0bd86cdc24edd75470f4de499cfcc"}, + {file = "pycparser-2.22.tar.gz", hash = "sha256:491c8be9c040f5390f5bf44a5b07752bd07f56edf992381b05c701439eec10f6"}, +] + +[[package]] +name = "pygments" +version = "2.18.0" +description = "Pygments is a syntax highlighting package written in Python." +optional = false +python-versions = ">=3.8" +files = [ + {file = "pygments-2.18.0-py3-none-any.whl", hash = "sha256:b8e6aca0523f3ab76fee51799c488e38782ac06eafcf95e7ba832985c8e7b13a"}, + {file = "pygments-2.18.0.tar.gz", hash = "sha256:786ff802f32e91311bff3889f6e9a86e81505fe99f2735bb6d60ae0c5004f199"}, +] + +[package.extras] +windows-terminal = ["colorama (>=0.4.6)"] + +[[package]] +name = "pymdown-extensions" +version = "10.11.2" +description = "Extension pack for Python Markdown." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "pymdown_extensions-10.11.2-py3-none-any.whl", hash = "sha256:41cdde0a77290e480cf53892f5c5e50921a7ee3e5cd60ba91bf19837b33badcf"}, + {file = "pymdown_extensions-10.11.2.tar.gz", hash = "sha256:bc8847ecc9e784a098efd35e20cba772bc5a1b529dfcef9dc1972db9021a1049"}, +] + +[package.dependencies] +markdown = ">=3.6" +pyyaml = "*" + +[package.extras] +extra = ["pygments (>=2.12)"] + +[[package]] +name = "pyparsing" +version = "3.2.0" +description = "pyparsing module - Classes and methods to define and execute parsing grammars" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pyparsing-3.2.0-py3-none-any.whl", hash = "sha256:93d9577b88da0bbea8cc8334ee8b918ed014968fd2ec383e868fb8afb1ccef84"}, + {file = "pyparsing-3.2.0.tar.gz", hash = "sha256:cbf74e27246d595d9a74b186b810f6fbb86726dbf3b9532efb343f6d7294fe9c"}, +] + +[package.extras] +diagrams = ["jinja2", "railroad-diagrams"] + +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +files = [ + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, +] + +[package.dependencies] +six = ">=1.5" + +[[package]] +name = "pytz" +version = "2024.2" +description = "World timezone definitions, modern and historical" +optional = false +python-versions = "*" +files = [ + {file = "pytz-2024.2-py2.py3-none-any.whl", hash = "sha256:31c7c1817eb7fae7ca4b8c7ee50c72f93aa2dd863de768e1ef4245d426aa0725"}, + {file = "pytz-2024.2.tar.gz", hash = "sha256:2aa355083c50a0f93fa581709deac0c9ad65cca8a9e9beac660adcbd493c798a"}, +] + +[[package]] +name = "pywin32" +version = "308" +description = "Python for Window Extensions" +optional = false +python-versions = "*" +files = [ + {file = "pywin32-308-cp310-cp310-win32.whl", hash = "sha256:796ff4426437896550d2981b9c2ac0ffd75238ad9ea2d3bfa67a1abd546d262e"}, + {file = "pywin32-308-cp310-cp310-win_amd64.whl", hash = "sha256:4fc888c59b3c0bef905ce7eb7e2106a07712015ea1c8234b703a088d46110e8e"}, + {file = "pywin32-308-cp310-cp310-win_arm64.whl", hash = "sha256:a5ab5381813b40f264fa3495b98af850098f814a25a63589a8e9eb12560f450c"}, + {file = "pywin32-308-cp311-cp311-win32.whl", hash = "sha256:5d8c8015b24a7d6855b1550d8e660d8daa09983c80e5daf89a273e5c6fb5095a"}, + {file = "pywin32-308-cp311-cp311-win_amd64.whl", hash = "sha256:575621b90f0dc2695fec346b2d6302faebd4f0f45c05ea29404cefe35d89442b"}, + {file = "pywin32-308-cp311-cp311-win_arm64.whl", hash = "sha256:100a5442b7332070983c4cd03f2e906a5648a5104b8a7f50175f7906efd16bb6"}, + {file = "pywin32-308-cp312-cp312-win32.whl", hash = "sha256:587f3e19696f4bf96fde9d8a57cec74a57021ad5f204c9e627e15c33ff568897"}, + {file = "pywin32-308-cp312-cp312-win_amd64.whl", hash = "sha256:00b3e11ef09ede56c6a43c71f2d31857cf7c54b0ab6e78ac659497abd2834f47"}, + {file = "pywin32-308-cp312-cp312-win_arm64.whl", hash = "sha256:9b4de86c8d909aed15b7011182c8cab38c8850de36e6afb1f0db22b8959e3091"}, + {file = "pywin32-308-cp313-cp313-win32.whl", hash = "sha256:1c44539a37a5b7b21d02ab34e6a4d314e0788f1690d65b48e9b0b89f31abbbed"}, + {file = "pywin32-308-cp313-cp313-win_amd64.whl", hash = "sha256:fd380990e792eaf6827fcb7e187b2b4b1cede0585e3d0c9e84201ec27b9905e4"}, + {file = 
"pywin32-308-cp313-cp313-win_arm64.whl", hash = "sha256:ef313c46d4c18dfb82a2431e3051ac8f112ccee1a34f29c263c583c568db63cd"}, + {file = "pywin32-308-cp37-cp37m-win32.whl", hash = "sha256:1f696ab352a2ddd63bd07430080dd598e6369152ea13a25ebcdd2f503a38f1ff"}, + {file = "pywin32-308-cp37-cp37m-win_amd64.whl", hash = "sha256:13dcb914ed4347019fbec6697a01a0aec61019c1046c2b905410d197856326a6"}, + {file = "pywin32-308-cp38-cp38-win32.whl", hash = "sha256:5794e764ebcabf4ff08c555b31bd348c9025929371763b2183172ff4708152f0"}, + {file = "pywin32-308-cp38-cp38-win_amd64.whl", hash = "sha256:3b92622e29d651c6b783e368ba7d6722b1634b8e70bd376fd7610fe1992e19de"}, + {file = "pywin32-308-cp39-cp39-win32.whl", hash = "sha256:7873ca4dc60ab3287919881a7d4f88baee4a6e639aa6962de25a98ba6b193341"}, + {file = "pywin32-308-cp39-cp39-win_amd64.whl", hash = "sha256:71b3322d949b4cc20776436a9c9ba0eeedcbc9c650daa536df63f0ff111bb920"}, +] + +[[package]] +name = "pyyaml" +version = "6.0.2" +description = "YAML parser and emitter for Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, + {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed"}, + {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180"}, + {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68"}, + {file = "PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99"}, + {file = "PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e"}, + {file = "PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774"}, + {file = "PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85"}, + {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4"}, + {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e"}, + {file = "PyYAML-6.0.2-cp311-cp311-win32.whl", hash 
= "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5"}, + {file = "PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44"}, + {file = "PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab"}, + {file = "PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725"}, + {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5"}, + {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425"}, + {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476"}, + {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48"}, + {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b"}, + {file = "PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4"}, + {file = "PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8"}, + {file = "PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba"}, + {file = "PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5"}, + {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc"}, + {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652"}, + {file = "PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183"}, + {file = "PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563"}, + {file = "PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083"}, 
+ {file = "PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706"}, + {file = "PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a"}, + {file = "PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff"}, + {file = "PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d"}, + {file = "PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19"}, + {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e"}, + {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725"}, + {file = "PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631"}, + {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"}, + {file = "pyyaml-6.0.2.tar.gz", hash = "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, +] + +[[package]] +name = "pyyaml-env-tag" +version = "0.1" +description = "A custom YAML tag for referencing environment variables in YAML files. 
" +optional = false +python-versions = ">=3.6" +files = [ + {file = "pyyaml_env_tag-0.1-py3-none-any.whl", hash = "sha256:af31106dec8a4d68c60207c1886031cbf839b68aa7abccdb19868200532c2069"}, + {file = "pyyaml_env_tag-0.1.tar.gz", hash = "sha256:70092675bda14fdec33b31ba77e7543de9ddc88f2e5b99160396572d11525bdb"}, +] + +[package.dependencies] +pyyaml = "*" + +[[package]] +name = "pyzmq" +version = "26.2.0" +description = "Python bindings for 0MQ" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pyzmq-26.2.0-cp310-cp310-macosx_10_15_universal2.whl", hash = "sha256:ddf33d97d2f52d89f6e6e7ae66ee35a4d9ca6f36eda89c24591b0c40205a3629"}, + {file = "pyzmq-26.2.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:dacd995031a01d16eec825bf30802fceb2c3791ef24bcce48fa98ce40918c27b"}, + {file = "pyzmq-26.2.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:89289a5ee32ef6c439086184529ae060c741334b8970a6855ec0b6ad3ff28764"}, + {file = "pyzmq-26.2.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5506f06d7dc6ecf1efacb4a013b1f05071bb24b76350832c96449f4a2d95091c"}, + {file = "pyzmq-26.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8ea039387c10202ce304af74def5021e9adc6297067f3441d348d2b633e8166a"}, + {file = "pyzmq-26.2.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:a2224fa4a4c2ee872886ed00a571f5e967c85e078e8e8c2530a2fb01b3309b88"}, + {file = "pyzmq-26.2.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:28ad5233e9c3b52d76196c696e362508959741e1a005fb8fa03b51aea156088f"}, + {file = "pyzmq-26.2.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:1c17211bc037c7d88e85ed8b7d8f7e52db6dc8eca5590d162717c654550f7282"}, + {file = "pyzmq-26.2.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:b8f86dd868d41bea9a5f873ee13bf5551c94cf6bc51baebc6f85075971fe6eea"}, + {file = "pyzmq-26.2.0-cp310-cp310-win32.whl", hash = "sha256:46a446c212e58456b23af260f3d9fb785054f3e3653dbf7279d8f2b5546b21c2"}, + {file = "pyzmq-26.2.0-cp310-cp310-win_amd64.whl", hash = "sha256:49d34ab71db5a9c292a7644ce74190b1dd5a3475612eefb1f8be1d6961441971"}, + {file = "pyzmq-26.2.0-cp310-cp310-win_arm64.whl", hash = "sha256:bfa832bfa540e5b5c27dcf5de5d82ebc431b82c453a43d141afb1e5d2de025fa"}, + {file = "pyzmq-26.2.0-cp311-cp311-macosx_10_15_universal2.whl", hash = "sha256:8f7e66c7113c684c2b3f1c83cdd3376103ee0ce4c49ff80a648643e57fb22218"}, + {file = "pyzmq-26.2.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3a495b30fc91db2db25120df5847d9833af237546fd59170701acd816ccc01c4"}, + {file = "pyzmq-26.2.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77eb0968da535cba0470a5165468b2cac7772cfb569977cff92e240f57e31bef"}, + {file = "pyzmq-26.2.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6ace4f71f1900a548f48407fc9be59c6ba9d9aaf658c2eea6cf2779e72f9f317"}, + {file = "pyzmq-26.2.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:92a78853d7280bffb93df0a4a6a2498cba10ee793cc8076ef797ef2f74d107cf"}, + {file = "pyzmq-26.2.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:689c5d781014956a4a6de61d74ba97b23547e431e9e7d64f27d4922ba96e9d6e"}, + {file = "pyzmq-26.2.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:0aca98bc423eb7d153214b2df397c6421ba6373d3397b26c057af3c904452e37"}, + {file = "pyzmq-26.2.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:1f3496d76b89d9429a656293744ceca4d2ac2a10ae59b84c1da9b5165f429ad3"}, + {file = 
"pyzmq-26.2.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5c2b3bfd4b9689919db068ac6c9911f3fcb231c39f7dd30e3138be94896d18e6"}, + {file = "pyzmq-26.2.0-cp311-cp311-win32.whl", hash = "sha256:eac5174677da084abf378739dbf4ad245661635f1600edd1221f150b165343f4"}, + {file = "pyzmq-26.2.0-cp311-cp311-win_amd64.whl", hash = "sha256:5a509df7d0a83a4b178d0f937ef14286659225ef4e8812e05580776c70e155d5"}, + {file = "pyzmq-26.2.0-cp311-cp311-win_arm64.whl", hash = "sha256:c0e6091b157d48cbe37bd67233318dbb53e1e6327d6fc3bb284afd585d141003"}, + {file = "pyzmq-26.2.0-cp312-cp312-macosx_10_15_universal2.whl", hash = "sha256:ded0fc7d90fe93ae0b18059930086c51e640cdd3baebdc783a695c77f123dcd9"}, + {file = "pyzmq-26.2.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:17bf5a931c7f6618023cdacc7081f3f266aecb68ca692adac015c383a134ca52"}, + {file = "pyzmq-26.2.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:55cf66647e49d4621a7e20c8d13511ef1fe1efbbccf670811864452487007e08"}, + {file = "pyzmq-26.2.0-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4661c88db4a9e0f958c8abc2b97472e23061f0bc737f6f6179d7a27024e1faa5"}, + {file = "pyzmq-26.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ea7f69de383cb47522c9c208aec6dd17697db7875a4674c4af3f8cfdac0bdeae"}, + {file = "pyzmq-26.2.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:7f98f6dfa8b8ccaf39163ce872bddacca38f6a67289116c8937a02e30bbe9711"}, + {file = "pyzmq-26.2.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:e3e0210287329272539eea617830a6a28161fbbd8a3271bf4150ae3e58c5d0e6"}, + {file = "pyzmq-26.2.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6b274e0762c33c7471f1a7471d1a2085b1a35eba5cdc48d2ae319f28b6fc4de3"}, + {file = "pyzmq-26.2.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:29c6a4635eef69d68a00321e12a7d2559fe2dfccfa8efae3ffb8e91cd0b36a8b"}, + {file = "pyzmq-26.2.0-cp312-cp312-win32.whl", hash = "sha256:989d842dc06dc59feea09e58c74ca3e1678c812a4a8a2a419046d711031f69c7"}, + {file = "pyzmq-26.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:2a50625acdc7801bc6f74698c5c583a491c61d73c6b7ea4dee3901bb99adb27a"}, + {file = "pyzmq-26.2.0-cp312-cp312-win_arm64.whl", hash = "sha256:4d29ab8592b6ad12ebbf92ac2ed2bedcfd1cec192d8e559e2e099f648570e19b"}, + {file = "pyzmq-26.2.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9dd8cd1aeb00775f527ec60022004d030ddc51d783d056e3e23e74e623e33726"}, + {file = "pyzmq-26.2.0-cp313-cp313-macosx_10_15_universal2.whl", hash = "sha256:28c812d9757fe8acecc910c9ac9dafd2ce968c00f9e619db09e9f8f54c3a68a3"}, + {file = "pyzmq-26.2.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4d80b1dd99c1942f74ed608ddb38b181b87476c6a966a88a950c7dee118fdf50"}, + {file = "pyzmq-26.2.0-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8c997098cc65e3208eca09303630e84d42718620e83b733d0fd69543a9cab9cb"}, + {file = "pyzmq-26.2.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7ad1bc8d1b7a18497dda9600b12dc193c577beb391beae5cd2349184db40f187"}, + {file = "pyzmq-26.2.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:bea2acdd8ea4275e1278350ced63da0b166421928276c7c8e3f9729d7402a57b"}, + {file = "pyzmq-26.2.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:23f4aad749d13698f3f7b64aad34f5fc02d6f20f05999eebc96b89b01262fb18"}, + {file = "pyzmq-26.2.0-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:a4f96f0d88accc3dbe4a9025f785ba830f968e21e3e2c6321ccdfc9aef755115"}, + 
{file = "pyzmq-26.2.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ced65e5a985398827cc9276b93ef6dfabe0273c23de8c7931339d7e141c2818e"}, + {file = "pyzmq-26.2.0-cp313-cp313-win32.whl", hash = "sha256:31507f7b47cc1ead1f6e86927f8ebb196a0bab043f6345ce070f412a59bf87b5"}, + {file = "pyzmq-26.2.0-cp313-cp313-win_amd64.whl", hash = "sha256:70fc7fcf0410d16ebdda9b26cbd8bf8d803d220a7f3522e060a69a9c87bf7bad"}, + {file = "pyzmq-26.2.0-cp313-cp313-win_arm64.whl", hash = "sha256:c3789bd5768ab5618ebf09cef6ec2b35fed88709b104351748a63045f0ff9797"}, + {file = "pyzmq-26.2.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:034da5fc55d9f8da09015d368f519478a52675e558c989bfcb5cf6d4e16a7d2a"}, + {file = "pyzmq-26.2.0-cp313-cp313t-macosx_10_15_universal2.whl", hash = "sha256:c92d73464b886931308ccc45b2744e5968cbaade0b1d6aeb40d8ab537765f5bc"}, + {file = "pyzmq-26.2.0-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:794a4562dcb374f7dbbfb3f51d28fb40123b5a2abadee7b4091f93054909add5"}, + {file = "pyzmq-26.2.0-cp313-cp313t-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aee22939bb6075e7afededabad1a56a905da0b3c4e3e0c45e75810ebe3a52672"}, + {file = "pyzmq-26.2.0-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ae90ff9dad33a1cfe947d2c40cb9cb5e600d759ac4f0fd22616ce6540f72797"}, + {file = "pyzmq-26.2.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:43a47408ac52647dfabbc66a25b05b6a61700b5165807e3fbd40063fcaf46386"}, + {file = "pyzmq-26.2.0-cp313-cp313t-musllinux_1_1_aarch64.whl", hash = "sha256:25bf2374a2a8433633c65ccb9553350d5e17e60c8eb4de4d92cc6bd60f01d306"}, + {file = "pyzmq-26.2.0-cp313-cp313t-musllinux_1_1_i686.whl", hash = "sha256:007137c9ac9ad5ea21e6ad97d3489af654381324d5d3ba614c323f60dab8fae6"}, + {file = "pyzmq-26.2.0-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:470d4a4f6d48fb34e92d768b4e8a5cc3780db0d69107abf1cd7ff734b9766eb0"}, + {file = "pyzmq-26.2.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:3b55a4229ce5da9497dd0452b914556ae58e96a4381bb6f59f1305dfd7e53fc8"}, + {file = "pyzmq-26.2.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:9cb3a6460cdea8fe8194a76de8895707e61ded10ad0be97188cc8463ffa7e3a8"}, + {file = "pyzmq-26.2.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:8ab5cad923cc95c87bffee098a27856c859bd5d0af31bd346035aa816b081fe1"}, + {file = "pyzmq-26.2.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9ed69074a610fad1c2fda66180e7b2edd4d31c53f2d1872bc2d1211563904cd9"}, + {file = "pyzmq-26.2.0-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:cccba051221b916a4f5e538997c45d7d136a5646442b1231b916d0164067ea27"}, + {file = "pyzmq-26.2.0-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:0eaa83fc4c1e271c24eaf8fb083cbccef8fde77ec8cd45f3c35a9a123e6da097"}, + {file = "pyzmq-26.2.0-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:9edda2df81daa129b25a39b86cb57dfdfe16f7ec15b42b19bfac503360d27a93"}, + {file = "pyzmq-26.2.0-cp37-cp37m-win32.whl", hash = "sha256:ea0eb6af8a17fa272f7b98d7bebfab7836a0d62738e16ba380f440fceca2d951"}, + {file = "pyzmq-26.2.0-cp37-cp37m-win_amd64.whl", hash = "sha256:4ff9dc6bc1664bb9eec25cd17506ef6672d506115095411e237d571e92a58231"}, + {file = "pyzmq-26.2.0-cp38-cp38-macosx_10_15_universal2.whl", hash = "sha256:2eb7735ee73ca1b0d71e0e67c3739c689067f055c764f73aac4cc8ecf958ee3f"}, + {file = "pyzmq-26.2.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:1a534f43bc738181aa7cbbaf48e3eca62c76453a40a746ab95d4b27b1111a7d2"}, + {file = "pyzmq-26.2.0-cp38-cp38-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:aedd5dd8692635813368e558a05266b995d3d020b23e49581ddd5bbe197a8ab6"}, + {file = "pyzmq-26.2.0-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:8be4700cd8bb02cc454f630dcdf7cfa99de96788b80c51b60fe2fe1dac480289"}, + {file = "pyzmq-26.2.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fcc03fa4997c447dce58264e93b5aa2d57714fbe0f06c07b7785ae131512732"}, + {file = "pyzmq-26.2.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:402b190912935d3db15b03e8f7485812db350d271b284ded2b80d2e5704be780"}, + {file = "pyzmq-26.2.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:8685fa9c25ff00f550c1fec650430c4b71e4e48e8d852f7ddcf2e48308038640"}, + {file = "pyzmq-26.2.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:76589c020680778f06b7e0b193f4b6dd66d470234a16e1df90329f5e14a171cd"}, + {file = "pyzmq-26.2.0-cp38-cp38-win32.whl", hash = "sha256:8423c1877d72c041f2c263b1ec6e34360448decfb323fa8b94e85883043ef988"}, + {file = "pyzmq-26.2.0-cp38-cp38-win_amd64.whl", hash = "sha256:76589f2cd6b77b5bdea4fca5992dc1c23389d68b18ccc26a53680ba2dc80ff2f"}, + {file = "pyzmq-26.2.0-cp39-cp39-macosx_10_15_universal2.whl", hash = "sha256:b1d464cb8d72bfc1a3adc53305a63a8e0cac6bc8c5a07e8ca190ab8d3faa43c2"}, + {file = "pyzmq-26.2.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4da04c48873a6abdd71811c5e163bd656ee1b957971db7f35140a2d573f6949c"}, + {file = "pyzmq-26.2.0-cp39-cp39-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:d049df610ac811dcffdc147153b414147428567fbbc8be43bb8885f04db39d98"}, + {file = "pyzmq-26.2.0-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:05590cdbc6b902101d0e65d6a4780af14dc22914cc6ab995d99b85af45362cc9"}, + {file = "pyzmq-26.2.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c811cfcd6a9bf680236c40c6f617187515269ab2912f3d7e8c0174898e2519db"}, + {file = "pyzmq-26.2.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:6835dd60355593de10350394242b5757fbbd88b25287314316f266e24c61d073"}, + {file = "pyzmq-26.2.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bc6bee759a6bddea5db78d7dcd609397449cb2d2d6587f48f3ca613b19410cfc"}, + {file = "pyzmq-26.2.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:c530e1eecd036ecc83c3407f77bb86feb79916d4a33d11394b8234f3bd35b940"}, + {file = "pyzmq-26.2.0-cp39-cp39-win32.whl", hash = "sha256:367b4f689786fca726ef7a6c5ba606958b145b9340a5e4808132cc65759abd44"}, + {file = "pyzmq-26.2.0-cp39-cp39-win_amd64.whl", hash = "sha256:e6fa2e3e683f34aea77de8112f6483803c96a44fd726d7358b9888ae5bb394ec"}, + {file = "pyzmq-26.2.0-cp39-cp39-win_arm64.whl", hash = "sha256:7445be39143a8aa4faec43b076e06944b8f9d0701b669df4af200531b21e40bb"}, + {file = "pyzmq-26.2.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:706e794564bec25819d21a41c31d4df2d48e1cc4b061e8d345d7fb4dd3e94072"}, + {file = "pyzmq-26.2.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b435f2753621cd36e7c1762156815e21c985c72b19135dac43a7f4f31d28dd1"}, + {file = "pyzmq-26.2.0-pp310-pypy310_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:160c7e0a5eb178011e72892f99f918c04a131f36056d10d9c1afb223fc952c2d"}, + {file = "pyzmq-26.2.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c4a71d5d6e7b28a47a394c0471b7e77a0661e2d651e7ae91e0cab0a587859ca"}, + 
{file = "pyzmq-26.2.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:90412f2db8c02a3864cbfc67db0e3dcdbda336acf1c469526d3e869394fe001c"}, + {file = "pyzmq-26.2.0-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:2ea4ad4e6a12e454de05f2949d4beddb52460f3de7c8b9d5c46fbb7d7222e02c"}, + {file = "pyzmq-26.2.0-pp37-pypy37_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:fc4f7a173a5609631bb0c42c23d12c49df3966f89f496a51d3eb0ec81f4519d6"}, + {file = "pyzmq-26.2.0-pp37-pypy37_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:878206a45202247781472a2d99df12a176fef806ca175799e1c6ad263510d57c"}, + {file = "pyzmq-26.2.0-pp37-pypy37_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:17c412bad2eb9468e876f556eb4ee910e62d721d2c7a53c7fa31e643d35352e6"}, + {file = "pyzmq-26.2.0-pp37-pypy37_pp73-win_amd64.whl", hash = "sha256:0d987a3ae5a71c6226b203cfd298720e0086c7fe7c74f35fa8edddfbd6597eed"}, + {file = "pyzmq-26.2.0-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:39887ac397ff35b7b775db7201095fc6310a35fdbae85bac4523f7eb3b840e20"}, + {file = "pyzmq-26.2.0-pp38-pypy38_pp73-manylinux_2_12_i686.manylinux2010_i686.whl", hash = "sha256:fdb5b3e311d4d4b0eb8b3e8b4d1b0a512713ad7e6a68791d0923d1aec433d919"}, + {file = "pyzmq-26.2.0-pp38-pypy38_pp73-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:226af7dcb51fdb0109f0016449b357e182ea0ceb6b47dfb5999d569e5db161d5"}, + {file = "pyzmq-26.2.0-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0bed0e799e6120b9c32756203fb9dfe8ca2fb8467fed830c34c877e25638c3fc"}, + {file = "pyzmq-26.2.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:29c7947c594e105cb9e6c466bace8532dc1ca02d498684128b339799f5248277"}, + {file = "pyzmq-26.2.0-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:cdeabcff45d1c219636ee2e54d852262e5c2e085d6cb476d938aee8d921356b3"}, + {file = "pyzmq-26.2.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:35cffef589bcdc587d06f9149f8d5e9e8859920a071df5a2671de2213bef592a"}, + {file = "pyzmq-26.2.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18c8dc3b7468d8b4bdf60ce9d7141897da103c7a4690157b32b60acb45e333e6"}, + {file = "pyzmq-26.2.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7133d0a1677aec369d67dd78520d3fa96dd7f3dcec99d66c1762870e5ea1a50a"}, + {file = "pyzmq-26.2.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:6a96179a24b14fa6428cbfc08641c779a53f8fcec43644030328f44034c7f1f4"}, + {file = "pyzmq-26.2.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:4f78c88905461a9203eac9faac157a2a0dbba84a0fd09fd29315db27be40af9f"}, + {file = "pyzmq-26.2.0.tar.gz", hash = "sha256:070672c258581c8e4f640b5159297580a9974b026043bd4ab0470be9ed324f1f"}, +] + +[package.dependencies] +cffi = {version = "*", markers = "implementation_name == \"pypy\""} + +[[package]] +name = "referencing" +version = "0.35.1" +description = "JSON Referencing + Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "referencing-0.35.1-py3-none-any.whl", hash = "sha256:eda6d3234d62814d1c64e305c1331c9a3a6132da475ab6382eaa997b21ee75de"}, + {file = "referencing-0.35.1.tar.gz", hash = "sha256:25b42124a6c8b632a425174f24087783efb348a6f1e0008e63cd4466fedf703c"}, +] + +[package.dependencies] +attrs = ">=22.2.0" +rpds-py = ">=0.7.0" + +[[package]] +name = "regex" +version = "2024.9.11" +description = "Alternative regular expression module, to replace re." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "regex-2024.9.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:1494fa8725c285a81d01dc8c06b55287a1ee5e0e382d8413adc0a9197aac6408"}, + {file = "regex-2024.9.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0e12c481ad92d129c78f13a2a3662317e46ee7ef96c94fd332e1c29131875b7d"}, + {file = "regex-2024.9.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:16e13a7929791ac1216afde26f712802e3df7bf0360b32e4914dca3ab8baeea5"}, + {file = "regex-2024.9.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:46989629904bad940bbec2106528140a218b4a36bb3042d8406980be1941429c"}, + {file = "regex-2024.9.11-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a906ed5e47a0ce5f04b2c981af1c9acf9e8696066900bf03b9d7879a6f679fc8"}, + {file = "regex-2024.9.11-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e9a091b0550b3b0207784a7d6d0f1a00d1d1c8a11699c1a4d93db3fbefc3ad35"}, + {file = "regex-2024.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ddcd9a179c0a6fa8add279a4444015acddcd7f232a49071ae57fa6e278f1f71"}, + {file = "regex-2024.9.11-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6b41e1adc61fa347662b09398e31ad446afadff932a24807d3ceb955ed865cc8"}, + {file = "regex-2024.9.11-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:ced479f601cd2f8ca1fd7b23925a7e0ad512a56d6e9476f79b8f381d9d37090a"}, + {file = "regex-2024.9.11-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:635a1d96665f84b292e401c3d62775851aedc31d4f8784117b3c68c4fcd4118d"}, + {file = "regex-2024.9.11-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:c0256beda696edcf7d97ef16b2a33a8e5a875affd6fa6567b54f7c577b30a137"}, + {file = "regex-2024.9.11-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:3ce4f1185db3fbde8ed8aa223fc9620f276c58de8b0d4f8cc86fd1360829edb6"}, + {file = "regex-2024.9.11-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:09d77559e80dcc9d24570da3745ab859a9cf91953062e4ab126ba9d5993688ca"}, + {file = "regex-2024.9.11-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:7a22ccefd4db3f12b526eccb129390942fe874a3a9fdbdd24cf55773a1faab1a"}, + {file = "regex-2024.9.11-cp310-cp310-win32.whl", hash = "sha256:f745ec09bc1b0bd15cfc73df6fa4f726dcc26bb16c23a03f9e3367d357eeedd0"}, + {file = "regex-2024.9.11-cp310-cp310-win_amd64.whl", hash = "sha256:01c2acb51f8a7d6494c8c5eafe3d8e06d76563d8a8a4643b37e9b2dd8a2ff623"}, + {file = "regex-2024.9.11-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:2cce2449e5927a0bf084d346da6cd5eb016b2beca10d0013ab50e3c226ffc0df"}, + {file = "regex-2024.9.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:3b37fa423beefa44919e009745ccbf353d8c981516e807995b2bd11c2c77d268"}, + {file = "regex-2024.9.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:64ce2799bd75039b480cc0360907c4fb2f50022f030bf9e7a8705b636e408fad"}, + {file = "regex-2024.9.11-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a4cc92bb6db56ab0c1cbd17294e14f5e9224f0cc6521167ef388332604e92679"}, + {file = "regex-2024.9.11-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d05ac6fa06959c4172eccd99a222e1fbf17b5670c4d596cb1e5cde99600674c4"}, + {file = "regex-2024.9.11-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:040562757795eeea356394a7fb13076ad4f99d3c62ab0f8bdfb21f99a1f85664"}, + {file = "regex-2024.9.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6113c008a7780792efc80f9dfe10ba0cd043cbf8dc9a76ef757850f51b4edc50"}, + {file = "regex-2024.9.11-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8e5fb5f77c8745a60105403a774fe2c1759b71d3e7b4ca237a5e67ad066c7199"}, + {file = "regex-2024.9.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:54d9ff35d4515debf14bc27f1e3b38bfc453eff3220f5bce159642fa762fe5d4"}, + {file = "regex-2024.9.11-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:df5cbb1fbc74a8305b6065d4ade43b993be03dbe0f8b30032cced0d7740994bd"}, + {file = "regex-2024.9.11-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:7fb89ee5d106e4a7a51bce305ac4efb981536301895f7bdcf93ec92ae0d91c7f"}, + {file = "regex-2024.9.11-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:a738b937d512b30bf75995c0159c0ddf9eec0775c9d72ac0202076c72f24aa96"}, + {file = "regex-2024.9.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e28f9faeb14b6f23ac55bfbbfd3643f5c7c18ede093977f1df249f73fd22c7b1"}, + {file = "regex-2024.9.11-cp311-cp311-win32.whl", hash = "sha256:18e707ce6c92d7282dfce370cd205098384b8ee21544e7cb29b8aab955b66fa9"}, + {file = "regex-2024.9.11-cp311-cp311-win_amd64.whl", hash = "sha256:313ea15e5ff2a8cbbad96ccef6be638393041b0a7863183c2d31e0c6116688cf"}, + {file = "regex-2024.9.11-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:b0d0a6c64fcc4ef9c69bd5b3b3626cc3776520a1637d8abaa62b9edc147a58f7"}, + {file = "regex-2024.9.11-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:49b0e06786ea663f933f3710a51e9385ce0cba0ea56b67107fd841a55d56a231"}, + {file = "regex-2024.9.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5b513b6997a0b2f10e4fd3a1313568e373926e8c252bd76c960f96fd039cd28d"}, + {file = "regex-2024.9.11-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee439691d8c23e76f9802c42a95cfeebf9d47cf4ffd06f18489122dbb0a7ad64"}, + {file = "regex-2024.9.11-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a8f877c89719d759e52783f7fe6e1c67121076b87b40542966c02de5503ace42"}, + {file = "regex-2024.9.11-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:23b30c62d0f16827f2ae9f2bb87619bc4fba2044911e2e6c2eb1af0161cdb766"}, + {file = "regex-2024.9.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85ab7824093d8f10d44330fe1e6493f756f252d145323dd17ab6b48733ff6c0a"}, + {file = "regex-2024.9.11-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8dee5b4810a89447151999428fe096977346cf2f29f4d5e29609d2e19e0199c9"}, + {file = "regex-2024.9.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:98eeee2f2e63edae2181c886d7911ce502e1292794f4c5ee71e60e23e8d26b5d"}, + {file = "regex-2024.9.11-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:57fdd2e0b2694ce6fc2e5ccf189789c3e2962916fb38779d3e3521ff8fe7a822"}, + {file = "regex-2024.9.11-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:d552c78411f60b1fdaafd117a1fca2f02e562e309223b9d44b7de8be451ec5e0"}, + {file = "regex-2024.9.11-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:a0b2b80321c2ed3fcf0385ec9e51a12253c50f146fddb2abbb10f033fe3d049a"}, + {file = "regex-2024.9.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:18406efb2f5a0e57e3a5881cd9354c1512d3bb4f5c45d96d110a66114d84d23a"}, + {file = 
"regex-2024.9.11-cp312-cp312-win32.whl", hash = "sha256:e464b467f1588e2c42d26814231edecbcfe77f5ac414d92cbf4e7b55b2c2a776"}, + {file = "regex-2024.9.11-cp312-cp312-win_amd64.whl", hash = "sha256:9e8719792ca63c6b8340380352c24dcb8cd7ec49dae36e963742a275dfae6009"}, + {file = "regex-2024.9.11-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:c157bb447303070f256e084668b702073db99bbb61d44f85d811025fcf38f784"}, + {file = "regex-2024.9.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4db21ece84dfeefc5d8a3863f101995de646c6cb0536952c321a2650aa202c36"}, + {file = "regex-2024.9.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:220e92a30b426daf23bb67a7962900ed4613589bab80382be09b48896d211e92"}, + {file = "regex-2024.9.11-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eb1ae19e64c14c7ec1995f40bd932448713d3c73509e82d8cd7744dc00e29e86"}, + {file = "regex-2024.9.11-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f47cd43a5bfa48f86925fe26fbdd0a488ff15b62468abb5d2a1e092a4fb10e85"}, + {file = "regex-2024.9.11-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9d4a76b96f398697fe01117093613166e6aa8195d63f1b4ec3f21ab637632963"}, + {file = "regex-2024.9.11-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ea51dcc0835eea2ea31d66456210a4e01a076d820e9039b04ae8d17ac11dee6"}, + {file = "regex-2024.9.11-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b7aaa315101c6567a9a45d2839322c51c8d6e81f67683d529512f5bcfb99c802"}, + {file = "regex-2024.9.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c57d08ad67aba97af57a7263c2d9006d5c404d721c5f7542f077f109ec2a4a29"}, + {file = "regex-2024.9.11-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:f8404bf61298bb6f8224bb9176c1424548ee1181130818fcd2cbffddc768bed8"}, + {file = "regex-2024.9.11-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:dd4490a33eb909ef5078ab20f5f000087afa2a4daa27b4c072ccb3cb3050ad84"}, + {file = "regex-2024.9.11-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:eee9130eaad130649fd73e5cd92f60e55708952260ede70da64de420cdcad554"}, + {file = "regex-2024.9.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6a2644a93da36c784e546de579ec1806bfd2763ef47babc1b03d765fe560c9f8"}, + {file = "regex-2024.9.11-cp313-cp313-win32.whl", hash = "sha256:e997fd30430c57138adc06bba4c7c2968fb13d101e57dd5bb9355bf8ce3fa7e8"}, + {file = "regex-2024.9.11-cp313-cp313-win_amd64.whl", hash = "sha256:042c55879cfeb21a8adacc84ea347721d3d83a159da6acdf1116859e2427c43f"}, + {file = "regex-2024.9.11-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:35f4a6f96aa6cb3f2f7247027b07b15a374f0d5b912c0001418d1d55024d5cb4"}, + {file = "regex-2024.9.11-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:55b96e7ce3a69a8449a66984c268062fbaa0d8ae437b285428e12797baefce7e"}, + {file = "regex-2024.9.11-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:cb130fccd1a37ed894824b8c046321540263013da72745d755f2d35114b81a60"}, + {file = "regex-2024.9.11-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:323c1f04be6b2968944d730e5c2091c8c89767903ecaa135203eec4565ed2b2b"}, + {file = "regex-2024.9.11-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:be1c8ed48c4c4065ecb19d882a0ce1afe0745dfad8ce48c49586b90a55f02366"}, + {file = "regex-2024.9.11-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:b5b029322e6e7b94fff16cd120ab35a253236a5f99a79fb04fda7ae71ca20ae8"}, + {file = "regex-2024.9.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f6fff13ef6b5f29221d6904aa816c34701462956aa72a77f1f151a8ec4f56aeb"}, + {file = "regex-2024.9.11-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:587d4af3979376652010e400accc30404e6c16b7df574048ab1f581af82065e4"}, + {file = "regex-2024.9.11-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:079400a8269544b955ffa9e31f186f01d96829110a3bf79dc338e9910f794fca"}, + {file = "regex-2024.9.11-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:f9268774428ec173654985ce55fc6caf4c6d11ade0f6f914d48ef4719eb05ebb"}, + {file = "regex-2024.9.11-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:23f9985c8784e544d53fc2930fc1ac1a7319f5d5332d228437acc9f418f2f168"}, + {file = "regex-2024.9.11-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:ae2941333154baff9838e88aa71c1d84f4438189ecc6021a12c7573728b5838e"}, + {file = "regex-2024.9.11-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:e93f1c331ca8e86fe877a48ad64e77882c0c4da0097f2212873a69bbfea95d0c"}, + {file = "regex-2024.9.11-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:846bc79ee753acf93aef4184c040d709940c9d001029ceb7b7a52747b80ed2dd"}, + {file = "regex-2024.9.11-cp38-cp38-win32.whl", hash = "sha256:c94bb0a9f1db10a1d16c00880bdebd5f9faf267273b8f5bd1878126e0fbde771"}, + {file = "regex-2024.9.11-cp38-cp38-win_amd64.whl", hash = "sha256:2b08fce89fbd45664d3df6ad93e554b6c16933ffa9d55cb7e01182baaf971508"}, + {file = "regex-2024.9.11-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:07f45f287469039ffc2c53caf6803cd506eb5f5f637f1d4acb37a738f71dd066"}, + {file = "regex-2024.9.11-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4838e24ee015101d9f901988001038f7f0d90dc0c3b115541a1365fb439add62"}, + {file = "regex-2024.9.11-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:6edd623bae6a737f10ce853ea076f56f507fd7726bee96a41ee3d68d347e4d16"}, + {file = "regex-2024.9.11-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c69ada171c2d0e97a4b5aa78fbb835e0ffbb6b13fc5da968c09811346564f0d3"}, + {file = "regex-2024.9.11-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:02087ea0a03b4af1ed6ebab2c54d7118127fee8d71b26398e8e4b05b78963199"}, + {file = "regex-2024.9.11-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:69dee6a020693d12a3cf892aba4808fe168d2a4cef368eb9bf74f5398bfd4ee8"}, + {file = "regex-2024.9.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:297f54910247508e6e5cae669f2bc308985c60540a4edd1c77203ef19bfa63ca"}, + {file = "regex-2024.9.11-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ecea58b43a67b1b79805f1a0255730edaf5191ecef84dbc4cc85eb30bc8b63b9"}, + {file = "regex-2024.9.11-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:eab4bb380f15e189d1313195b062a6aa908f5bd687a0ceccd47c8211e9cf0d4a"}, + {file = "regex-2024.9.11-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0cbff728659ce4bbf4c30b2a1be040faafaa9eca6ecde40aaff86f7889f4ab39"}, + {file = "regex-2024.9.11-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:54c4a097b8bc5bb0dfc83ae498061d53ad7b5762e00f4adaa23bee22b012e6ba"}, + {file = "regex-2024.9.11-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = 
"sha256:73d6d2f64f4d894c96626a75578b0bf7d9e56dcda8c3d037a2118fdfe9b1c664"}, + {file = "regex-2024.9.11-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:e53b5fbab5d675aec9f0c501274c467c0f9a5d23696cfc94247e1fb56501ed89"}, + {file = "regex-2024.9.11-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:0ffbcf9221e04502fc35e54d1ce9567541979c3fdfb93d2c554f0ca583a19b35"}, + {file = "regex-2024.9.11-cp39-cp39-win32.whl", hash = "sha256:e4c22e1ac1f1ec1e09f72e6c44d8f2244173db7eb9629cc3a346a8d7ccc31142"}, + {file = "regex-2024.9.11-cp39-cp39-win_amd64.whl", hash = "sha256:faa3c142464efec496967359ca99696c896c591c56c53506bac1ad465f66e919"}, + {file = "regex-2024.9.11.tar.gz", hash = "sha256:6c188c307e8433bcb63dc1915022deb553b4203a70722fc542c363bf120a01fd"}, +] + +[[package]] +name = "requests" +version = "2.32.3" +description = "Python HTTP for Humans." +optional = false +python-versions = ">=3.8" +files = [ + {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, + {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, +] + +[package.dependencies] +certifi = ">=2017.4.17" +charset-normalizer = ">=2,<4" +idna = ">=2.5,<4" +urllib3 = ">=1.21.1,<3" + +[package.extras] +socks = ["PySocks (>=1.5.6,!=1.5.7)"] +use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] + +[[package]] +name = "rpds-py" +version = "0.20.0" +description = "Python bindings to Rust's persistent data structures (rpds)" +optional = false +python-versions = ">=3.8" +files = [ + {file = "rpds_py-0.20.0-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:3ad0fda1635f8439cde85c700f964b23ed5fc2d28016b32b9ee5fe30da5c84e2"}, + {file = "rpds_py-0.20.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:9bb4a0d90fdb03437c109a17eade42dfbf6190408f29b2744114d11586611d6f"}, + {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c6377e647bbfd0a0b159fe557f2c6c602c159fc752fa316572f012fc0bf67150"}, + {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:eb851b7df9dda52dc1415ebee12362047ce771fc36914586b2e9fcbd7d293b3e"}, + {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e0f80b739e5a8f54837be5d5c924483996b603d5502bfff79bf33da06164ee2"}, + {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5a8c94dad2e45324fc74dce25e1645d4d14df9a4e54a30fa0ae8bad9a63928e3"}, + {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8e604fe73ba048c06085beaf51147eaec7df856824bfe7b98657cf436623daf"}, + {file = "rpds_py-0.20.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:df3de6b7726b52966edf29663e57306b23ef775faf0ac01a3e9f4012a24a4140"}, + {file = "rpds_py-0.20.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cf258ede5bc22a45c8e726b29835b9303c285ab46fc7c3a4cc770736b5304c9f"}, + {file = "rpds_py-0.20.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:55fea87029cded5df854ca7e192ec7bdb7ecd1d9a3f63d5c4eb09148acf4a7ce"}, + {file = "rpds_py-0.20.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ae94bd0b2f02c28e199e9bc51485d0c5601f58780636185660f86bf80c89af94"}, + {file = "rpds_py-0.20.0-cp310-none-win32.whl", hash = "sha256:28527c685f237c05445efec62426d285e47a58fb05ba0090a4340b73ecda6dee"}, + {file = "rpds_py-0.20.0-cp310-none-win_amd64.whl", hash = 
"sha256:238a2d5b1cad28cdc6ed15faf93a998336eb041c4e440dd7f902528b8891b399"}, + {file = "rpds_py-0.20.0-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:ac2f4f7a98934c2ed6505aead07b979e6f999389f16b714448fb39bbaa86a489"}, + {file = "rpds_py-0.20.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:220002c1b846db9afd83371d08d239fdc865e8f8c5795bbaec20916a76db3318"}, + {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8d7919548df3f25374a1f5d01fbcd38dacab338ef5f33e044744b5c36729c8db"}, + {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:758406267907b3781beee0f0edfe4a179fbd97c0be2e9b1154d7f0a1279cf8e5"}, + {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3d61339e9f84a3f0767b1995adfb171a0d00a1185192718a17af6e124728e0f5"}, + {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1259c7b3705ac0a0bd38197565a5d603218591d3f6cee6e614e380b6ba61c6f6"}, + {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5c1dc0f53856b9cc9a0ccca0a7cc61d3d20a7088201c0937f3f4048c1718a209"}, + {file = "rpds_py-0.20.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7e60cb630f674a31f0368ed32b2a6b4331b8350d67de53c0359992444b116dd3"}, + {file = "rpds_py-0.20.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:dbe982f38565bb50cb7fb061ebf762c2f254ca3d8c20d4006878766e84266272"}, + {file = "rpds_py-0.20.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:514b3293b64187172bc77c8fb0cdae26981618021053b30d8371c3a902d4d5ad"}, + {file = "rpds_py-0.20.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d0a26ffe9d4dd35e4dfdd1e71f46401cff0181c75ac174711ccff0459135fa58"}, + {file = "rpds_py-0.20.0-cp311-none-win32.whl", hash = "sha256:89c19a494bf3ad08c1da49445cc5d13d8fefc265f48ee7e7556839acdacf69d0"}, + {file = "rpds_py-0.20.0-cp311-none-win_amd64.whl", hash = "sha256:c638144ce971df84650d3ed0096e2ae7af8e62ecbbb7b201c8935c370df00a2c"}, + {file = "rpds_py-0.20.0-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:a84ab91cbe7aab97f7446652d0ed37d35b68a465aeef8fc41932a9d7eee2c1a6"}, + {file = "rpds_py-0.20.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:56e27147a5a4c2c21633ff8475d185734c0e4befd1c989b5b95a5d0db699b21b"}, + {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2580b0c34583b85efec8c5c5ec9edf2dfe817330cc882ee972ae650e7b5ef739"}, + {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b80d4a7900cf6b66bb9cee5c352b2d708e29e5a37fe9bf784fa97fc11504bf6c"}, + {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:50eccbf054e62a7b2209b28dc7a22d6254860209d6753e6b78cfaeb0075d7bee"}, + {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:49a8063ea4296b3a7e81a5dfb8f7b2d73f0b1c20c2af401fb0cdf22e14711a96"}, + {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ea438162a9fcbee3ecf36c23e6c68237479f89f962f82dae83dc15feeceb37e4"}, + {file = "rpds_py-0.20.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:18d7585c463087bddcfa74c2ba267339f14f2515158ac4db30b1f9cbdb62c8ef"}, + {file = "rpds_py-0.20.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d4c7d1a051eeb39f5c9547e82ea27cbcc28338482242e3e0b7768033cb083821"}, + 
{file = "rpds_py-0.20.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:e4df1e3b3bec320790f699890d41c59d250f6beda159ea3c44c3f5bac1976940"}, + {file = "rpds_py-0.20.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2cf126d33a91ee6eedc7f3197b53e87a2acdac63602c0f03a02dd69e4b138174"}, + {file = "rpds_py-0.20.0-cp312-none-win32.whl", hash = "sha256:8bc7690f7caee50b04a79bf017a8d020c1f48c2a1077ffe172abec59870f1139"}, + {file = "rpds_py-0.20.0-cp312-none-win_amd64.whl", hash = "sha256:0e13e6952ef264c40587d510ad676a988df19adea20444c2b295e536457bc585"}, + {file = "rpds_py-0.20.0-cp313-cp313-macosx_10_12_x86_64.whl", hash = "sha256:aa9a0521aeca7d4941499a73ad7d4f8ffa3d1affc50b9ea11d992cd7eff18a29"}, + {file = "rpds_py-0.20.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4a1f1d51eccb7e6c32ae89243cb352389228ea62f89cd80823ea7dd1b98e0b91"}, + {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8a86a9b96070674fc88b6f9f71a97d2c1d3e5165574615d1f9168ecba4cecb24"}, + {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:6c8ef2ebf76df43f5750b46851ed1cdf8f109d7787ca40035fe19fbdc1acc5a7"}, + {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b74b25f024b421d5859d156750ea9a65651793d51b76a2e9238c05c9d5f203a9"}, + {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:57eb94a8c16ab08fef6404301c38318e2c5a32216bf5de453e2714c964c125c8"}, + {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e1940dae14e715e2e02dfd5b0f64a52e8374a517a1e531ad9412319dc3ac7879"}, + {file = "rpds_py-0.20.0-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d20277fd62e1b992a50c43f13fbe13277a31f8c9f70d59759c88f644d66c619f"}, + {file = "rpds_py-0.20.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:06db23d43f26478303e954c34c75182356ca9aa7797d22c5345b16871ab9c45c"}, + {file = "rpds_py-0.20.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:b2a5db5397d82fa847e4c624b0c98fe59d2d9b7cf0ce6de09e4d2e80f8f5b3f2"}, + {file = "rpds_py-0.20.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5a35df9f5548fd79cb2f52d27182108c3e6641a4feb0f39067911bf2adaa3e57"}, + {file = "rpds_py-0.20.0-cp313-none-win32.whl", hash = "sha256:fd2d84f40633bc475ef2d5490b9c19543fbf18596dcb1b291e3a12ea5d722f7a"}, + {file = "rpds_py-0.20.0-cp313-none-win_amd64.whl", hash = "sha256:9bc2d153989e3216b0559251b0c260cfd168ec78b1fac33dd485750a228db5a2"}, + {file = "rpds_py-0.20.0-cp38-cp38-macosx_10_12_x86_64.whl", hash = "sha256:f2fbf7db2012d4876fb0d66b5b9ba6591197b0f165db8d99371d976546472a24"}, + {file = "rpds_py-0.20.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:1e5f3cd7397c8f86c8cc72d5a791071431c108edd79872cdd96e00abd8497d29"}, + {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce9845054c13696f7af7f2b353e6b4f676dab1b4b215d7fe5e05c6f8bb06f965"}, + {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c3e130fd0ec56cb76eb49ef52faead8ff09d13f4527e9b0c400307ff72b408e1"}, + {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4b16aa0107ecb512b568244ef461f27697164d9a68d8b35090e9b0c1c8b27752"}, + {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:aa7f429242aae2947246587d2964fad750b79e8c233a2367f71b554e9447949c"}, + {file = 
"rpds_py-0.20.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af0fc424a5842a11e28956e69395fbbeab2c97c42253169d87e90aac2886d751"}, + {file = "rpds_py-0.20.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b8c00a3b1e70c1d3891f0db1b05292747f0dbcfb49c43f9244d04c70fbc40eb8"}, + {file = "rpds_py-0.20.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:40ce74fc86ee4645d0a225498d091d8bc61f39b709ebef8204cb8b5a464d3c0e"}, + {file = "rpds_py-0.20.0-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:4fe84294c7019456e56d93e8ababdad5a329cd25975be749c3f5f558abb48253"}, + {file = "rpds_py-0.20.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:338ca4539aad4ce70a656e5187a3a31c5204f261aef9f6ab50e50bcdffaf050a"}, + {file = "rpds_py-0.20.0-cp38-none-win32.whl", hash = "sha256:54b43a2b07db18314669092bb2de584524d1ef414588780261e31e85846c26a5"}, + {file = "rpds_py-0.20.0-cp38-none-win_amd64.whl", hash = "sha256:a1862d2d7ce1674cffa6d186d53ca95c6e17ed2b06b3f4c476173565c862d232"}, + {file = "rpds_py-0.20.0-cp39-cp39-macosx_10_12_x86_64.whl", hash = "sha256:3fde368e9140312b6e8b6c09fb9f8c8c2f00999d1823403ae90cc00480221b22"}, + {file = "rpds_py-0.20.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:9824fb430c9cf9af743cf7aaf6707bf14323fb51ee74425c380f4c846ea70789"}, + {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:11ef6ce74616342888b69878d45e9f779b95d4bd48b382a229fe624a409b72c5"}, + {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:c52d3f2f82b763a24ef52f5d24358553e8403ce05f893b5347098014f2d9eff2"}, + {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9d35cef91e59ebbeaa45214861874bc6f19eb35de96db73e467a8358d701a96c"}, + {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d72278a30111e5b5525c1dd96120d9e958464316f55adb030433ea905866f4de"}, + {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b4c29cbbba378759ac5786730d1c3cb4ec6f8ababf5c42a9ce303dc4b3d08cda"}, + {file = "rpds_py-0.20.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6632f2d04f15d1bd6fe0eedd3b86d9061b836ddca4c03d5cf5c7e9e6b7c14580"}, + {file = "rpds_py-0.20.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:d0b67d87bb45ed1cd020e8fbf2307d449b68abc45402fe1a4ac9e46c3c8b192b"}, + {file = "rpds_py-0.20.0-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:ec31a99ca63bf3cd7f1a5ac9fe95c5e2d060d3c768a09bc1d16e235840861420"}, + {file = "rpds_py-0.20.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:22e6c9976e38f4d8c4a63bd8a8edac5307dffd3ee7e6026d97f3cc3a2dc02a0b"}, + {file = "rpds_py-0.20.0-cp39-none-win32.whl", hash = "sha256:569b3ea770c2717b730b61998b6c54996adee3cef69fc28d444f3e7920313cf7"}, + {file = "rpds_py-0.20.0-cp39-none-win_amd64.whl", hash = "sha256:e6900ecdd50ce0facf703f7a00df12374b74bbc8ad9fe0f6559947fb20f82364"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:617c7357272c67696fd052811e352ac54ed1d9b49ab370261a80d3b6ce385045"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:9426133526f69fcaba6e42146b4e12d6bc6c839b8b555097020e2b78ce908dcc"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:deb62214c42a261cb3eb04d474f7155279c1a8a8c30ac89b7dcb1721d92c3c02"}, + {file = 
"rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fcaeb7b57f1a1e071ebd748984359fef83ecb026325b9d4ca847c95bc7311c92"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d454b8749b4bd70dd0a79f428731ee263fa6995f83ccb8bada706e8d1d3ff89d"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d807dc2051abe041b6649681dce568f8e10668e3c1c6543ebae58f2d7e617855"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c3c20f0ddeb6e29126d45f89206b8291352b8c5b44384e78a6499d68b52ae511"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b7f19250ceef892adf27f0399b9e5afad019288e9be756d6919cb58892129f51"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:4f1ed4749a08379555cebf4650453f14452eaa9c43d0a95c49db50c18b7da075"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:dcedf0b42bcb4cfff4101d7771a10532415a6106062f005ab97d1d0ab5681c60"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:39ed0d010457a78f54090fafb5d108501b5aa5604cc22408fc1c0c77eac14344"}, + {file = "rpds_py-0.20.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:bb273176be34a746bdac0b0d7e4e2c467323d13640b736c4c477881a3220a989"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f918a1a130a6dfe1d7fe0f105064141342e7dd1611f2e6a21cd2f5c8cb1cfb3e"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:f60012a73aa396be721558caa3a6fd49b3dd0033d1675c6d59c4502e870fcf0c"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3d2b1ad682a3dfda2a4e8ad8572f3100f95fad98cb99faf37ff0ddfe9cbf9d03"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:614fdafe9f5f19c63ea02817fa4861c606a59a604a77c8cdef5aa01d28b97921"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fa518bcd7600c584bf42e6617ee8132869e877db2f76bcdc281ec6a4113a53ab"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f0475242f447cc6cb8a9dd486d68b2ef7fbee84427124c232bff5f63b1fe11e5"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f90a4cd061914a60bd51c68bcb4357086991bd0bb93d8aa66a6da7701370708f"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:def7400461c3a3f26e49078302e1c1b38f6752342c77e3cf72ce91ca69fb1bc1"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:65794e4048ee837494aea3c21a28ad5fc080994dfba5b036cf84de37f7ad5074"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:faefcc78f53a88f3076b7f8be0a8f8d35133a3ecf7f3770895c25f8813460f08"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:5b4f105deeffa28bbcdff6c49b34e74903139afa690e35d2d9e3c2c2fba18cec"}, + {file = "rpds_py-0.20.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:fdfc3a892927458d98f3d55428ae46b921d1f7543b89382fdb483f5640daaec8"}, + {file = "rpds_py-0.20.0.tar.gz", hash = "sha256:d72a210824facfdaf8768cf2d7ca25a042c30320b3020de2fa04640920d4e121"}, +] + 
+[[package]] +name = "setuptools" +version = "75.1.0" +description = "Easily download, build, install, upgrade, and uninstall Python packages" +optional = false +python-versions = ">=3.8" +files = [ + {file = "setuptools-75.1.0-py3-none-any.whl", hash = "sha256:35ab7fd3bcd95e6b7fd704e4a1539513edad446c097797f2985e0e4b960772f2"}, + {file = "setuptools-75.1.0.tar.gz", hash = "sha256:d59a21b17a275fb872a9c3dae73963160ae079f1049ed956880cd7c09b120538"}, +] + +[package.extras] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)", "ruff (>=0.5.2)"] +core = ["importlib-metadata (>=6)", "importlib-resources (>=5.10.2)", "jaraco.collections", "jaraco.functools", "jaraco.text (>=3.7)", "more-itertools", "more-itertools (>=8.8)", "packaging", "packaging (>=24)", "platformdirs (>=2.6.2)", "tomli (>=2.0.1)", "wheel (>=0.43.0)"] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "pygments-github-lexers (==0.0.5)", "pyproject-hooks (!=1.1)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-favicon", "sphinx-inline-tabs", "sphinx-lint", "sphinx-notfound-page (>=1,<2)", "sphinx-reredirects", "sphinxcontrib-towncrier", "towncrier (<24.7)"] +enabler = ["pytest-enabler (>=2.2)"] +test = ["build[virtualenv] (>=1.0.3)", "filelock (>=3.4.0)", "ini2toml[lite] (>=0.14)", "jaraco.develop (>=7.21)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "jaraco.test", "packaging (>=23.2)", "pip (>=19.1)", "pyproject-hooks (!=1.1)", "pytest (>=6,!=8.1.*)", "pytest-home (>=0.5)", "pytest-perf", "pytest-subprocess", "pytest-timeout", "pytest-xdist (>=3)", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel (>=0.44.0)"] +type = ["importlib-metadata (>=7.0.2)", "jaraco.develop (>=7.21)", "mypy (==1.11.*)", "pytest-mypy"] + +[[package]] +name = "shtab" +version = "1.7.1" +description = "Automagic shell tab completion for Python CLI applications" +optional = false +python-versions = ">=3.7" +files = [ + {file = "shtab-1.7.1-py3-none-any.whl", hash = "sha256:32d3d2ff9022d4c77a62492b6ec875527883891e33c6b479ba4d41a51e259983"}, + {file = "shtab-1.7.1.tar.gz", hash = "sha256:4e4bcb02eeb82ec45920a5d0add92eac9c9b63b2804c9196c1f1fdc2d039243c"}, +] + +[package.extras] +dev = ["pytest (>=6)", "pytest-cov", "pytest-timeout"] + +[[package]] +name = "six" +version = "1.16.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" +files = [ + {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, + {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, +] + +[[package]] +name = "smmap" +version = "5.0.1" +description = "A pure Python implementation of a sliding window memory map manager" +optional = false +python-versions = ">=3.7" +files = [ + {file = "smmap-5.0.1-py3-none-any.whl", hash = "sha256:e6d8668fa5f93e706934a62d7b4db19c8d9eb8cf2adbb75ef1b675aa332b69da"}, + {file = "smmap-5.0.1.tar.gz", hash = "sha256:dceeb6c0028fdb6734471eb07c0cd2aae706ccaecab45965ee83f11c8d3b1f62"}, +] + +[[package]] +name = "soupsieve" +version = "2.6" +description = "A modern CSS selector implementation for Beautiful Soup." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "soupsieve-2.6-py3-none-any.whl", hash = "sha256:e72c4ff06e4fb6e4b5a9f0f55fe6e81514581fca1515028625d0f299c602ccc9"}, + {file = "soupsieve-2.6.tar.gz", hash = "sha256:e2e68417777af359ec65daac1057404a3c8a5455bb8abc36f1a9866ab1a51abb"}, +] + +[[package]] +name = "stack-data" +version = "0.6.3" +description = "Extract data from python stack frames and tracebacks for informative displays" +optional = false +python-versions = "*" +files = [ + {file = "stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695"}, + {file = "stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9"}, +] + +[package.dependencies] +asttokens = ">=2.1.0" +executing = ">=1.2.0" +pure-eval = "*" + +[package.extras] +tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"] + +[[package]] +name = "super-collections" +version = "0.5.3" +description = "file: README.md" +optional = false +python-versions = ">=3.8" +files = [ + {file = "super_collections-0.5.3-py3-none-any.whl", hash = "sha256:907d35b25dc4070910e8254bf2f5c928348af1cf8a1f1e8259e06c666e902cff"}, + {file = "super_collections-0.5.3.tar.gz", hash = "sha256:94c1ec96c0a0d5e8e7d389ed8cde6882ac246940507c5e6b86e91945c2968d46"}, +] + +[package.dependencies] +hjson = "*" + +[package.extras] +test = ["pytest (>=7.0)"] + +[[package]] +name = "termcolor" +version = "2.5.0" +description = "ANSI color formatting for output in terminal" +optional = false +python-versions = ">=3.9" +files = [ + {file = "termcolor-2.5.0-py3-none-any.whl", hash = "sha256:37b17b5fc1e604945c2642c872a3764b5d547a48009871aea3edd3afa180afb8"}, + {file = "termcolor-2.5.0.tar.gz", hash = "sha256:998d8d27da6d48442e8e1f016119076b690d962507531df4890fcd2db2ef8a6f"}, +] + +[package.extras] +tests = ["pytest", "pytest-cov"] + +[[package]] +name = "tinycss2" +version = "1.3.0" +description = "A tiny CSS parser" +optional = false +python-versions = ">=3.8" +files = [ + {file = "tinycss2-1.3.0-py3-none-any.whl", hash = "sha256:54a8dbdffb334d536851be0226030e9505965bb2f30f21a4a82c55fb2a80fae7"}, + {file = "tinycss2-1.3.0.tar.gz", hash = "sha256:152f9acabd296a8375fbca5b84c961ff95971fcfc32e79550c8df8e29118c54d"}, +] + +[package.dependencies] +webencodings = ">=0.4" + +[package.extras] +doc = ["sphinx", "sphinx_rtd_theme"] +test = ["pytest", "ruff"] + +[[package]] +name = "tomli" +version = "2.0.2" +description = "A lil' TOML parser" +optional = false +python-versions = ">=3.8" +files = [ + {file = "tomli-2.0.2-py3-none-any.whl", hash = "sha256:2ebe24485c53d303f690b0ec092806a085f07af5a5aa1464f3931eec36caaa38"}, + {file = "tomli-2.0.2.tar.gz", hash = "sha256:d46d457a85337051c36524bc5349dd91b1877838e2979ac5ced3e710ed8a60ed"}, +] + +[[package]] +name = "tornado" +version = "6.4.1" +description = "Tornado is a Python web framework and asynchronous networking library, originally developed at FriendFeed." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "tornado-6.4.1-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:163b0aafc8e23d8cdc3c9dfb24c5368af84a81e3364745ccb4427669bf84aec8"}, + {file = "tornado-6.4.1-cp38-abi3-macosx_10_9_x86_64.whl", hash = "sha256:6d5ce3437e18a2b66fbadb183c1d3364fb03f2be71299e7d10dbeeb69f4b2a14"}, + {file = "tornado-6.4.1-cp38-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e2e20b9113cd7293f164dc46fffb13535266e713cdb87bd2d15ddb336e96cfc4"}, + {file = "tornado-6.4.1-cp38-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8ae50a504a740365267b2a8d1a90c9fbc86b780a39170feca9bcc1787ff80842"}, + {file = "tornado-6.4.1-cp38-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:613bf4ddf5c7a95509218b149b555621497a6cc0d46ac341b30bd9ec19eac7f3"}, + {file = "tornado-6.4.1-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:25486eb223babe3eed4b8aecbac33b37e3dd6d776bc730ca14e1bf93888b979f"}, + {file = "tornado-6.4.1-cp38-abi3-musllinux_1_2_i686.whl", hash = "sha256:454db8a7ecfcf2ff6042dde58404164d969b6f5d58b926da15e6b23817950fc4"}, + {file = "tornado-6.4.1-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a02a08cc7a9314b006f653ce40483b9b3c12cda222d6a46d4ac63bb6c9057698"}, + {file = "tornado-6.4.1-cp38-abi3-win32.whl", hash = "sha256:d9a566c40b89757c9aa8e6f032bcdb8ca8795d7c1a9762910c722b1635c9de4d"}, + {file = "tornado-6.4.1-cp38-abi3-win_amd64.whl", hash = "sha256:b24b8982ed444378d7f21d563f4180a2de31ced9d8d84443907a0a64da2072e7"}, + {file = "tornado-6.4.1.tar.gz", hash = "sha256:92d3ab53183d8c50f8204a51e6f91d18a15d5ef261e84d452800d4ff6fc504e9"}, +] + +[[package]] +name = "traitlets" +version = "5.14.3" +description = "Traitlets Python configuration system" +optional = false +python-versions = ">=3.8" +files = [ + {file = "traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f"}, + {file = "traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7"}, +] + +[package.extras] +docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] +test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0,<8.2)", "pytest-mock", "pytest-mypy-testing"] + +[[package]] +name = "typing-extensions" +version = "4.12.2" +description = "Backported and Experimental Type Hints for Python 3.8+" +optional = false +python-versions = ">=3.8" +files = [ + {file = "typing_extensions-4.12.2-py3-none-any.whl", hash = "sha256:04e5ca0351e0f3f85c6853954072df659d0d13fac324d0072316b67d7794700d"}, + {file = "typing_extensions-4.12.2.tar.gz", hash = "sha256:1a7ead55c7e559dd4dee8856e3a88b41225abfe1ce8df57b7c13915fe121ffb8"}, +] + +[[package]] +name = "urllib3" +version = "2.2.3" +description = "HTTP library with thread-safe connection pooling, file post, and more." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac"}, + {file = "urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9"}, +] + +[package.extras] +brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +h2 = ["h2 (>=4,<5)"] +socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] +zstd = ["zstandard (>=0.18.0)"] + +[[package]] +name = "verspec" +version = "0.1.0" +description = "Flexible version handling" +optional = false +python-versions = "*" +files = [ + {file = "verspec-0.1.0-py3-none-any.whl", hash = "sha256:741877d5633cc9464c45a469ae2a31e801e6dbbaa85b9675d481cda100f11c31"}, + {file = "verspec-0.1.0.tar.gz", hash = "sha256:c4504ca697b2056cdb4bfa7121461f5a0e81809255b41c03dda4ba823637c01e"}, +] + +[package.extras] +test = ["coverage", "flake8 (>=3.7)", "mypy", "pretend", "pytest"] + +[[package]] +name = "watchdog" +version = "5.0.3" +description = "Filesystem events monitoring" +optional = false +python-versions = ">=3.9" +files = [ + {file = "watchdog-5.0.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:85527b882f3facda0579bce9d743ff7f10c3e1e0db0a0d0e28170a7d0e5ce2ea"}, + {file = "watchdog-5.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:53adf73dcdc0ef04f7735066b4a57a4cd3e49ef135daae41d77395f0b5b692cb"}, + {file = "watchdog-5.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e25adddab85f674acac303cf1f5835951345a56c5f7f582987d266679979c75b"}, + {file = "watchdog-5.0.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f01f4a3565a387080dc49bdd1fefe4ecc77f894991b88ef927edbfa45eb10818"}, + {file = "watchdog-5.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:91b522adc25614cdeaf91f7897800b82c13b4b8ac68a42ca959f992f6990c490"}, + {file = "watchdog-5.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d52db5beb5e476e6853da2e2d24dbbbed6797b449c8bf7ea118a4ee0d2c9040e"}, + {file = "watchdog-5.0.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:94d11b07c64f63f49876e0ab8042ae034674c8653bfcdaa8c4b32e71cfff87e8"}, + {file = "watchdog-5.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:349c9488e1d85d0a58e8cb14222d2c51cbc801ce11ac3936ab4c3af986536926"}, + {file = "watchdog-5.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:53a3f10b62c2d569e260f96e8d966463dec1a50fa4f1b22aec69e3f91025060e"}, + {file = "watchdog-5.0.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:950f531ec6e03696a2414b6308f5c6ff9dab7821a768c9d5788b1314e9a46ca7"}, + {file = "watchdog-5.0.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ae6deb336cba5d71476caa029ceb6e88047fc1dc74b62b7c4012639c0b563906"}, + {file = "watchdog-5.0.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1021223c08ba8d2d38d71ec1704496471ffd7be42cfb26b87cd5059323a389a1"}, + {file = "watchdog-5.0.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:752fb40efc7cc8d88ebc332b8f4bcbe2b5cc7e881bccfeb8e25054c00c994ee3"}, + {file = "watchdog-5.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:a2e8f3f955d68471fa37b0e3add18500790d129cc7efe89971b8a4cc6fdeb0b2"}, + {file = "watchdog-5.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b8ca4d854adcf480bdfd80f46fdd6fb49f91dd020ae11c89b3a79e19454ec627"}, + {file = "watchdog-5.0.3-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:90a67d7857adb1d985aca232cc9905dd5bc4803ed85cfcdcfcf707e52049eda7"}, + {file = "watchdog-5.0.3-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = 
"sha256:720ef9d3a4f9ca575a780af283c8fd3a0674b307651c1976714745090da5a9e8"}, + {file = "watchdog-5.0.3-pp39-pypy39_pp73-macosx_10_15_x86_64.whl", hash = "sha256:223160bb359281bb8e31c8f1068bf71a6b16a8ad3d9524ca6f523ac666bb6a1e"}, + {file = "watchdog-5.0.3-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:560135542c91eaa74247a2e8430cf83c4342b29e8ad4f520ae14f0c8a19cfb5b"}, + {file = "watchdog-5.0.3-py3-none-manylinux2014_aarch64.whl", hash = "sha256:dd021efa85970bd4824acacbb922066159d0f9e546389a4743d56919b6758b91"}, + {file = "watchdog-5.0.3-py3-none-manylinux2014_armv7l.whl", hash = "sha256:78864cc8f23dbee55be34cc1494632a7ba30263951b5b2e8fc8286b95845f82c"}, + {file = "watchdog-5.0.3-py3-none-manylinux2014_i686.whl", hash = "sha256:1e9679245e3ea6498494b3028b90c7b25dbb2abe65c7d07423ecfc2d6218ff7c"}, + {file = "watchdog-5.0.3-py3-none-manylinux2014_ppc64.whl", hash = "sha256:9413384f26b5d050b6978e6fcd0c1e7f0539be7a4f1a885061473c5deaa57221"}, + {file = "watchdog-5.0.3-py3-none-manylinux2014_ppc64le.whl", hash = "sha256:294b7a598974b8e2c6123d19ef15de9abcd282b0fbbdbc4d23dfa812959a9e05"}, + {file = "watchdog-5.0.3-py3-none-manylinux2014_s390x.whl", hash = "sha256:26dd201857d702bdf9d78c273cafcab5871dd29343748524695cecffa44a8d97"}, + {file = "watchdog-5.0.3-py3-none-manylinux2014_x86_64.whl", hash = "sha256:0f9332243355643d567697c3e3fa07330a1d1abf981611654a1f2bf2175612b7"}, + {file = "watchdog-5.0.3-py3-none-win32.whl", hash = "sha256:c66f80ee5b602a9c7ab66e3c9f36026590a0902db3aea414d59a2f55188c1f49"}, + {file = "watchdog-5.0.3-py3-none-win_amd64.whl", hash = "sha256:f00b4cf737f568be9665563347a910f8bdc76f88c2970121c86243c8cfdf90e9"}, + {file = "watchdog-5.0.3-py3-none-win_ia64.whl", hash = "sha256:49f4d36cb315c25ea0d946e018c01bb028048023b9e103d3d3943f58e109dd45"}, + {file = "watchdog-5.0.3.tar.gz", hash = "sha256:108f42a7f0345042a854d4d0ad0834b741d421330d5f575b81cb27b883500176"}, +] + +[package.extras] +watchmedo = ["PyYAML (>=3.10)"] + +[[package]] +name = "wcmatch" +version = "10.0" +description = "Wildcard/glob file name matcher." 
+optional = false +python-versions = ">=3.8" +files = [ + {file = "wcmatch-10.0-py3-none-any.whl", hash = "sha256:0dd927072d03c0a6527a20d2e6ad5ba8d0380e60870c383bc533b71744df7b7a"}, + {file = "wcmatch-10.0.tar.gz", hash = "sha256:e72f0de09bba6a04e0de70937b0cf06e55f36f37b3deb422dfaf854b867b840a"}, +] + +[package.dependencies] +bracex = ">=2.1.1" + +[[package]] +name = "wcwidth" +version = "0.2.13" +description = "Measures the displayed width of unicode strings in a terminal" +optional = false +python-versions = "*" +files = [ + {file = "wcwidth-0.2.13-py2.py3-none-any.whl", hash = "sha256:3da69048e4540d84af32131829ff948f1e022c1c6bdb8d6102117aac784f6859"}, + {file = "wcwidth-0.2.13.tar.gz", hash = "sha256:72ea0c06399eb286d978fdedb6923a9eb47e1c486ce63e9b4e64fc18303972b5"}, +] + +[[package]] +name = "webencodings" +version = "0.5.1" +description = "Character encoding aliases for legacy web content" +optional = false +python-versions = "*" +files = [ + {file = "webencodings-0.5.1-py2.py3-none-any.whl", hash = "sha256:a0af1213f3c2226497a97e2b3aa01a7e4bee4f403f95be16fc9acd2947514a78"}, + {file = "webencodings-0.5.1.tar.gz", hash = "sha256:b36a1c245f2d304965eb4e0a82848379241dc04b865afcc4aab16748587e1923"}, +] + +[[package]] +name = "zipp" +version = "3.20.2" +description = "Backport of pathlib-compatible object wrapper for zip files" +optional = false +python-versions = ">=3.8" +files = [ + {file = "zipp-3.20.2-py3-none-any.whl", hash = "sha256:a817ac80d6cf4b23bf7f2828b7cabf326f15a001bea8b1f9b49631780ba28350"}, + {file = "zipp-3.20.2.tar.gz", hash = "sha256:bc9eb26f4506fda01b81bcde0ca78103b6e62f991b381fec825435c836edbc29"}, +] + +[package.extras] +check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1)"] +cover = ["pytest-cov"] +doc = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.linker (>=1.9)", "sphinx (>=3.5)", "sphinx-lint"] +enabler = ["pytest-enabler (>=2.2)"] +test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", "jaraco.test", "more-itertools", "pytest (>=6,!=8.1.*)", "pytest-ignore-flaky"] +type = ["pytest-mypy"] + +[metadata] +lock-version = "2.0" +python-versions = "^3.10" +content-hash = "e2fd4c717a0d0c95bc3cea08104489573a6b78bb681b429f509798d73e4539c5" diff --git a/docs/pyproject.toml b/docs/pyproject.toml new file mode 100644 index 00000000..3b0adbcf --- /dev/null +++ b/docs/pyproject.toml @@ -0,0 +1,34 @@ +[tool.poetry] +name = "icechunk-docs" +description = "Icechunk documentation website" +authors = ["Orestis Herodotou "] +readme = "README.md" + +# Disable package mode +package-mode = false + +[tool.poetry.dependencies] +python = "^3.10" +mkdocs = "^1.6.1" +mkdocs-material = {extras = ["imaging"], version = "^9.5.39"} +mkdocstrings = {extras = ["python"], version = "^0.26.2"} +mkdocs-jupyter = "^0.25.0" +mkdocs-awesome-pages-plugin = "^2.9.3" +mkdocs-git-revision-date-localized-plugin = "^1.2.9" +mkdocs-git-committers-plugin-2 = "^2.4.1" +mkdocs-macros-plugin = "^1.2.0" +mkdocs-include-markdown-plugin = "^6.2.2" +mkdocs-open-in-new-tab = "^1.0.6" +mkdocs-redirects = "^1.2.1" +mkdocs-breadcrumbs-plugin = "^0.1.10" +mkdocs-minify-plugin = "^0.8.0" +mkdocs-mermaid2-plugin = "^1.1.1" +mkdocs-git-authors-plugin = "^0.9.0" + +[tool.poetry.group.dev.dependencies] +mike = "^2.1.3" +shtab = "^1.7.1" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/icechunk-python/Cargo.toml b/icechunk-python/Cargo.toml index febaa271..1d5d5d75 100644 --- a/icechunk-python/Cargo.toml +++ 
b/icechunk-python/Cargo.toml @@ -1,8 +1,16 @@ [package] name = "icechunk-python" -version = "0.1.0" +version = "0.1.0-alpha.3" +description = "Transactional storage engine for Zarr designed for use on cloud object storage" +readme = "../README.md" +repository = "https://github.com/earth-mover/icechunk" +homepage = "https://github.com/earth-mover/icechunk" +license = "Apache-2.0" +keywords = ["zarr", "xarray", "database"] +categories = ["database", "science", "science::geo"] +authors = ["Earthmover PBC"] edition = "2021" -publish = false +publish = true # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [lib] @@ -13,7 +21,7 @@ crate-type = ["cdylib"] bytes = "1.7.2" chrono = { version = "0.4.38" } futures = "0.3.30" -icechunk = { path = "../icechunk", version = "0.1.0" } +icechunk = { path = "../icechunk", version = "0.1.0-alpha.3" } pyo3 = { version = "0.21", features = [ "chrono", "extension-module", @@ -23,6 +31,7 @@ pyo3-asyncio-0-21 = { version = "0.21.0", features = ["tokio-runtime"] } async-stream = "0.3.5" thiserror = "1.0.64" tokio = "1.40" +serde_json = "1.0.128" [lints] workspace = true diff --git a/icechunk-python/README.md b/icechunk-python/README.md index a7f2e1fd..c316a3ac 100644 --- a/icechunk-python/README.md +++ b/icechunk-python/README.md @@ -34,11 +34,11 @@ pip install -e icechunk@. Now you can create or open an icechunk store for use with `zarr-python`: ```python -from icechunk import IcechunkStore, Storage +from icechunk import IcechunkStore, StorageConfig from zarr import Array, Group -storage = Storage.memory("test") -store = await IcechunkStore.open(storage=storage, mode='r+') +storage = StorageConfig.memory("test") +store = IcechunkStore.open_or_create(storage=storage, mode='r+') root = Group.from_store(store=store, zarr_format=zarr_format) foo = root.create_array("foo", shape=(100,), chunks=(10,), dtype="i4") diff --git a/icechunk-python/examples/dask_write.py b/icechunk-python/examples/dask_write.py new file mode 100644 index 00000000..7d10e6a0 --- /dev/null +++ b/icechunk-python/examples/dask_write.py @@ -0,0 +1,328 @@ +""" +This example uses Dask to write or update an array in an Icechunk repository. + +To understand all the available options run: +``` +python ./examples/dask_write.py --help +python ./examples/dask_write.py create --help +python ./examples/dask_write.py update --help +python ./examples/dask_write.py verify --help +``` + +Example usage: + +``` +python ./examples/dask_write.py create --url s3://my-bucket/my-icechunk-repo --t-chunks 100000 --x-chunks 4 --y-chunks 4 --chunk-x-size 112 --chunk-y-size 112 +python ./examples/dask_write.py update --url s3://my-bucket/my-icechunk-repo --t-from 0 --t-to 1500 --workers 16 +python ./examples/dask_write.py verify --url s3://my-bucket/my-icechunk-repo --t-from 0 --t-to 1500 --workers 16 +``` + +The work is split into three different commands. +* `create` initializes the repository and the array, without writing any chunks. For this example + we chose a 3D array that simulates a dataset that needs backfilling across its time dimension. +* `update` can be called multiple times to write a number of "pancakes" to the array. + It does so by distributing the work among Dask workers, in small tasks, one pancake per task. + The example invocation above, will write 1,500 pancakes using 16 Dask workers. +* `verify` can read a part of the array and check that it contains the required data. 
+
+Icechunk can do distributed writes to object store, but currently, it cannot use the Dask array API
+(we are working on it, see https://github.com/earth-mover/icechunk/issues/185).
+Dask can still be used to read and write to Icechunk from multiple processes and machines; we just need to use a lower-level
+Dask API based, for example, on `map/gather`. This mechanism is what we show in this example.
+"""
+
+import argparse
+from dataclasses import dataclass
+from typing import Any, cast
+from urllib.parse import urlparse
+
+import icechunk
+import numpy as np
+import zarr
+from dask.distributed import Client
+from dask.distributed import print as dprint
+
+
+@dataclass
+class Task:
+    """A task distributed to Dask workers"""
+    store: icechunk.IcechunkStore  # The worker will use this Icechunk store to read/write to the dataset
+    time: int  # The position in the coordinate dimension where the read/write should happen
+    seed: int  # An RNG seed used to generate or recreate random data for the array
+
+
+def generate_task_array(task: Task, shape: tuple[int,...]) -> np.typing.ArrayLike:
+    """Generates a random array with the given shape, using the seed in the Task"""
+    np.random.seed(task.seed)
+    return np.random.rand(*shape)
+
+
+def execute_write_task(task: Task) -> icechunk.IcechunkStore:
+    """Execute task as a write task.
+
+    This will read the time coordinate from `task` and write a "pancake" in that position,
+    using random data. Random data is generated using the task seed.
+
+    Returns the Icechunk store after the write is done.
+
+    As you can see, Icechunk stores can be passed to remote workers, and returned from them.
+    The reason to return the store is that we'll need all the remote stores, when they are
+    done, to be able to do a single, global commit to Icechunk.
+    """
+
+    store = task.store
+
+    group = zarr.group(store=store, overwrite=False)
+    array = cast(zarr.Array, group["array"])
+    dprint(f"Writing at t={task.time}")
+    data = generate_task_array(task, array.shape[0:2])
+    array[:, :, task.time] = data
+    dprint(f"Writing at t={task.time} done")
+    return store
+
+
+def execute_read_task(task: Task) -> None:
+    """Execute task as a read task.
+
+    This will read the time coordinate from `task` and read a "pancake" in that position.
+    Then it will assert the data is valid by re-generating the random data from the passed seed.
+
+    As you can see, Icechunk stores can be passed to remote workers.
+    """
+
+    store = task.store
+    group = zarr.group(store=store, overwrite=False)
+    array = cast(zarr.Array, group["array"])
+
+    actual = array[:, :, task.time]
+    expected = generate_task_array(task, array.shape[0:2])
+    np.testing.assert_array_equal(actual, expected)
+    dprint(f"t={task.time} verified")
+
+
+def storage_config(args: argparse.Namespace) -> dict[str, Any]:
+    """Return the Icechunk store S3 configuration map"""
+    bucket = args.url.netloc
+    prefix = args.url.path[1:]
+    return {
+        "bucket": bucket,
+        "prefix": prefix,
+    }
+
+
+def store_config(args: argparse.Namespace) -> dict[str, Any]:
+    """Return the Icechunk store configuration.
+
+    We lower the default to make sure we write chunks and not inline them.
+    """
+    return {"inline_chunk_threshold_bytes": 1}
+
+
+def create(args: argparse.Namespace) -> None:
+    """Execute the create subcommand.
+
+    Creates an Icechunk store, a root group and an array named "array"
+    with the shape passed as arguments.
+
+    Commits the Icechunk repository when done.
+ """ + store = icechunk.IcechunkStore.open_or_create( + storage=icechunk.StorageConfig.s3_from_env(**storage_config(args)), + mode="w", + config=icechunk.StoreConfig(**store_config(args)), + ) + + group = zarr.group(store=store, overwrite=True) + shape = ( + args.x_chunks * args.chunk_x_size, + args.y_chunks * args.chunk_y_size, + args.t_chunks * 1, + ) + chunk_shape = (args.chunk_x_size, args.chunk_y_size, 1) + + group.create_array( + "array", + shape=shape, + chunk_shape=chunk_shape, + dtype="f8", + fill_value=float("nan"), + ) + _first_snapshot = store.commit("array created") + print("Array initialized") + + +def update(args: argparse.Namespace) -> None: + """Execute the update subcommand. + + Uses Dask to write chunks to the Icechunk repository. Currently Icechunk cannot + use the Dask array API (see https://github.com/earth-mover/icechunk/issues/185) but we + can still use a lower level API to do the writes: + * We split the work into small `Task`s, one 'pancake' per task, at a given t coordinate. + * We use Dask's `map` to ship the `Task` to a worker + * The `Task` includes a copy of the Icechunk Store, so workers can do the writes + * When workers are done, they send their store back + * When all workers are done (Dask's `gather`), we take all Stores and do a distributed commit in Icechunk + """ + storage_conf = storage_config(args) + store_conf = store_config(args) + + store = icechunk.IcechunkStore.open_or_create( + storage=icechunk.StorageConfig.s3_from_env(**storage_conf), + mode="r+", + config=icechunk.StoreConfig(**store_conf), + ) + + group = zarr.group(store=store, overwrite=False) + array = cast(zarr.Array, group["array"]) + print(f"Found an array with shape: {array.shape}") + + tasks = [ + Task( + store=store, + time=time, + seed=time, + ) + for time in range(args.t_from, args.t_to, 1) + ] + + client = Client(n_workers=args.workers, threads_per_worker=1) + + map_result = client.map(execute_write_task, tasks) + worker_stores = client.gather(map_result) + + print("Starting distributed commit") + # we can use the current store as the commit coordinator, because it doesn't have any pending changes, + # all changes come from the tasks, Icechunk doesn't care about where the changes come from, the only + # important thing is to not count changes twice + commit_res = store.distributed_commit("distributed commit", [ws.change_set_bytes() for ws in worker_stores]) + assert commit_res + print("Distributed commit done") + + +def verify(args: argparse.Namespace) -> None: + """Execute the verify subcommand. + + Uses Dask to read and verify chunks from the Icechunk repository. Currently Icechunk cannot + use the Dask array API (see https://github.com/earth-mover/icechunk/issues/185) but we + can still use a lower level API to do the verification: + * We split the work into small `Task`s, one 'pancake' per task, at a given t coordinate. 
+ * We use Dask's `map` to ship the `Task` to a worker + * The `Task` includes a copy of the Icechunk Store, so workers can do the Icechunk reads + """ + storage_conf = storage_config(args) + store_conf = store_config(args) + + store = icechunk.IcechunkStore.open_or_create( + storage=icechunk.StorageConfig.s3_from_env(**storage_conf), + mode="r", + config=icechunk.StoreConfig(**store_conf), + ) + + group = zarr.group(store=store, overwrite=False) + array = cast(zarr.Array, group["array"]) + print(f"Found an array with shape: {array.shape}") + + tasks = [ + Task( + store=store, + time=time, + seed=time, + ) + for time in range(args.t_from, args.t_to, 1) + ] + + client = Client(n_workers=args.workers, threads_per_worker=1) + + map_result = client.map(execute_read_task, tasks) + client.gather(map_result) + print("done, all good") + + +def main() -> None: + """Main entry point for the script. + + Parses arguments and delegates to a subcommand. + """ + + global_parser = argparse.ArgumentParser(prog="dask_write") + global_parser.add_argument("--url", type=str, help="url for the repository: s3://bucket/optional-prefix/repository-name", required=True) + subparsers = global_parser.add_subparsers(title="subcommands", required=True) + + create_parser = subparsers.add_parser("create", help="create repo and array") + create_parser.add_argument( + "--x-chunks", type=int, help="number of chunks in the x dimension", default=4 + ) + create_parser.add_argument( + "--y-chunks", type=int, help="number of chunks in the y dimension", default=4 + ) + create_parser.add_argument( + "--t-chunks", type=int, help="number of chunks in the t dimension", default=1000 + ) + create_parser.add_argument( + "--chunk-x-size", + type=int, + help="size of chunks in the x dimension", + default=112, + ) + create_parser.add_argument( + "--chunk-y-size", + type=int, + help="size of chunks in the y dimension", + default=112, + ) + create_parser.set_defaults(command="create") + + update_parser = subparsers.add_parser("update", help="add chunks to the array") + update_parser.add_argument( + "--t-from", + type=int, + help="time position where to start adding chunks (included)", + required=True, + ) + update_parser.add_argument( + "--t-to", + type=int, + help="time position where to stop adding chunks (not included)", + required=True, + ) + update_parser.add_argument( + "--workers", type=int, help="number of workers to use", required=True + ) + update_parser.set_defaults(command="update") + + verify_parser = subparsers.add_parser("verify", help="verify array chunks") + verify_parser.add_argument( + "--t-from", + type=int, + help="time position where to start adding chunks (included)", + required=True, + ) + verify_parser.add_argument( + "--t-to", + type=int, + help="time position where to stop adding chunks (not included)", + required=True, + ) + verify_parser.add_argument( + "--workers", type=int, help="number of workers to use", required=True + ) + verify_parser.set_defaults(command="verify") + + args = global_parser.parse_args() + url = urlparse(args.url, "s3") + if url.scheme != "s3" or url.netloc == '' or url.path == '' or url.params != '' or url.query != '' or url.fragment != '': + raise ValueError(f"Invalid url {args.url}") + + args.url = url + + match args.command: + case "create": + create(args) + case "update": + update(args) + case "verify": + verify(args) + + +if __name__ == "__main__": + main() diff --git a/icechunk-python/examples/smoke-test.py b/icechunk-python/examples/smoke-test.py index c9bae991..a7926c32 100644 --- 
a/icechunk-python/examples/smoke-test.py +++ b/icechunk-python/examples/smoke-test.py @@ -1,17 +1,15 @@ import asyncio -from typing import Literal -from zarr.store import LocalStore, MemoryStore, RemoteStore import math +import random +import string +import time +from typing import Literal import numpy as np import zarr -import time - +from icechunk import IcechunkStore, S3Credentials, StorageConfig, StoreConfig from zarr.abc.store import Store - -from icechunk import IcechunkStore, Storage, S3Credentials, StoreConfig -import random -import string +from zarr.storage import LocalStore, MemoryStore, RemoteStore def rdms(n): @@ -45,7 +43,7 @@ def create_array(*, group, name, size, dtype, fill_value) -> np.ndarray: array, chunk_shape = generate_array_chunks(size=size, dtype=dtype) - group.create_array( + group.require_array( name=name, shape=array.shape, dtype=array.dtype, @@ -68,7 +66,7 @@ async def run(store: Store) -> None: first_commit = None if isinstance(store, IcechunkStore): - first_commit = await store.commit("initial commit") + first_commit = store.commit("initial commit") expected = {} expected["root-foo"] = create_array( @@ -81,32 +79,32 @@ async def run(store: Store) -> None: group["root-foo"].attrs["update"] = "new attr" if isinstance(store, IcechunkStore): - _second_commit = await store.commit("added array, updated attr") + _second_commit = store.commit("added array, updated attr") assert len(group["root-foo"].attrs) == 2 assert len(group.members()) == 1 if isinstance(store, IcechunkStore) and first_commit is not None: - await store.checkout(first_commit) + store.checkout(first_commit) group.attrs["update"] = "new attr 2" if isinstance(store, IcechunkStore): try: - await store.commit("new attr 2") + store.commit("new attr 2") except ValueError: pass else: raise ValueError("should have conflicted") - await store.reset() # FIXME: WHY - await store.checkout(branch="main") + store.reset() + store.checkout(branch="main") group["root-foo"].attrs["update"] = "new attr 2" if isinstance(store, IcechunkStore): - _third_commit = await store.commit("new attr 2") + _third_commit = store.commit("new attr 2") try: - await store.commit("rewrote array") + store.commit("rewrote array") except ValueError: pass else: @@ -136,7 +134,7 @@ async def run(store: Store) -> None: fill_value=-1234, ) if isinstance(store, IcechunkStore): - _fourth_commit = await store.commit("added groups and arrays") + _fourth_commit = store.commit("added groups and arrays") print(f"Write done in {time.time() - write_start} secs") @@ -149,16 +147,16 @@ async def run(store: Store) -> None: assert isinstance(array, zarr.Array) print( - f"numchunks: {math.prod(s // c for s, c in zip(array.shape, array.chunks))}" + f"numchunks: {math.prod(s // c for s, c in zip(array.shape, array.chunks, strict=False))}" ) np.testing.assert_array_equal(array[:], value) print(f"Read done in {time.time() - read_start} secs") -async def create_icechunk_store(*, storage: Storage) -> IcechunkStore: - return await IcechunkStore.create( - storage=storage, mode="r+", config=StoreConfig(inline_chunk_threshold=1) +def create_icechunk_store(*, storage: StorageConfig) -> IcechunkStore: + return IcechunkStore.open_or_create( + storage=storage, mode="w", config=StoreConfig(inline_chunk_threshold_bytes=1) ) @@ -171,13 +169,18 @@ async def create_zarr_store(*, store: Literal["memory", "local", "s3"]) -> Store return RemoteStore.from_url( "s3://testbucket/root-zarr", mode="w", - storage_options={"endpoint_url": "http://localhost:9000"}, + storage_options={ + 
"anon": False, + "key": "minio123", + "secret": "minio123", + "endpoint_url": "http://localhost:9000", + }, ) if __name__ == "__main__": - MEMORY = Storage.memory("new") - MINIO = Storage.s3_from_credentials( + MEMORY = StorageConfig.memory("new") + MINIO = StorageConfig.s3_from_config( bucket="testbucket", prefix="root-icechunk", credentials=S3Credentials( @@ -185,15 +188,13 @@ async def create_zarr_store(*, store: Literal["memory", "local", "s3"]) -> Store secret_access_key="minio123", session_token=None, ), + region="us-east-1", + allow_http=True, endpoint_url="http://localhost:9000", ) - S3 = Storage.s3_from_env( - bucket="icechunk-test", - prefix="demo-repository", - ) print("Icechunk store") - store = asyncio.run(create_icechunk_store(storage=MINIO)) + store = create_icechunk_store(storage=MINIO) asyncio.run(run(store)) print("Zarr store") diff --git a/icechunk-python/notebooks/demo-dummy-data.ipynb b/icechunk-python/notebooks/demo-dummy-data.ipynb index a81147e2..a4fc771f 100644 --- a/icechunk-python/notebooks/demo-dummy-data.ipynb +++ b/icechunk-python/notebooks/demo-dummy-data.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "2abaa80e-f07f-4e6d-b322-0ed280ec77e8", "metadata": {}, "outputs": [], @@ -21,8 +21,7 @@ "\n", "import numpy as np\n", "import zarr\n", - "\n", - "from icechunk import IcechunkStore, Storage" + "from icechunk import IcechunkStore, StorageConfig" ] }, { @@ -37,25 +36,24 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "4f40240b-eb4b-408b-8fd9-bb4e5a60a34d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "store = await IcechunkStore.create(\n", - " storage=Storage.memory(\"icechunk-demo\"),\n", - " mode=\"w\",\n", + "store = IcechunkStore.create(\n", + " storage=StorageConfig.memory(\"icechunk-demo\"),\n", ")\n", "store" ] @@ -70,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "8fa3197a-0674-431f-9dc1-c59fab055cc0", "metadata": {}, "outputs": [], @@ -88,7 +86,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "9690e567-2494-4421-bf47-d9e442c4975f", "metadata": {}, "outputs": [], @@ -145,7 +143,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "34a6780e-379d-47ec-bc2a-b599cfab105a", "metadata": {}, "outputs": [ @@ -155,7 +153,7 @@ "{'foo': 'foo'}" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -176,23 +174,23 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "d43060dd-6678-45f0-91ed-6786dea6cfa7", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'2471WQXNN789CTT07172HDDBQG'" + "'M419JDES7SDXBA6NCT4G'" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "first_commit = await store.commit(\"wrote a root group attribute\")\n", + "first_commit = store.commit(\"wrote a root group attribute\")\n", "first_commit" ] }, @@ -208,7 +206,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "bcaf1dec-65de-4572-ac05-a470ce45e100", "metadata": {}, "outputs": [], @@ -224,7 +222,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "aae0f3e8-2db7-437a-8d67-11f07aa47d14", "metadata": {}, "outputs": [ @@ -232,52 
+230,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "store_path \n", - "{\n", - " \"shape\": [\n", - " 5,\n", - " 5,\n", - " 64,\n", - " 128\n", - " ],\n", - " \"data_type\": \"int32\",\n", - " \"chunk_grid\": {\n", - " \"name\": \"regular\",\n", - " \"configuration\": {\n", - " \"chunk_shape\": [\n", - " 1,\n", - " 2,\n", - " 8,\n", - " 2\n", - " ]\n", - " }\n", - " },\n", - " \"chunk_key_encoding\": {\n", - " \"name\": \"default\",\n", - " \"configuration\": {\n", - " \"separator\": \"/\"\n", - " }\n", - " },\n", - " \"fill_value\": -1,\n", - " \"codecs\": [\n", - " {\n", - " \"name\": \"bytes\",\n", - " \"configuration\": {\n", - " \"endian\": \"little\"\n", - " }\n", - " }\n", - " ],\n", - " \"dimension_names\": [\n", - " \"x\",\n", - " \"y\",\n", - " \"z\",\n", - " \"t\"\n", - " ],\n", - " \"attributes\": {\n", - " \"description\": \"icechunk test data\"\n", - " }\n", - "}\n", - "(('root-foo', /root-foo shape=(5, 5, 64, 128) dtype=int32>),)\n" + "(('root-foo', /root-foo shape=(5, 5, 64, 128) dtype=int32>),)\n" ] } ], @@ -358,191 +311,42 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 9, "id": "78b45ec7-ead8-46c5-b553-476abbd2bca4", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"shape\": [\n", - " 5,\n", - " 5,\n", - " 64,\n", - " 128\n", - " ],\n", - " \"data_type\": \"int32\",\n", - " \"chunk_grid\": {\n", - " \"name\": \"regular\",\n", - " \"configuration\": {\n", - " \"chunk_shape\": [\n", - " 1,\n", - " 2,\n", - " 8,\n", - " 2\n", - " ]\n", - " }\n", - " },\n", - " \"chunk_key_encoding\": {\n", - " \"name\": \"default\",\n", - " \"configuration\": {\n", - " \"separator\": \"/\"\n", - " }\n", - " },\n", - " \"fill_value\": -1,\n", - " \"codecs\": [\n", - " {\n", - " \"name\": \"bytes\",\n", - " \"configuration\": {\n", - " \"endian\": \"little\"\n", - " }\n", - " }\n", - " ],\n", - " \"dimension_names\": [\n", - " \"x\",\n", - " \"y\",\n", - " \"z\",\n", - " \"t\"\n", - " ],\n", - " \"attributes\": {\n", - " \"description\": \"icechunk test data\"\n", - " }\n", - "}\n" - ] - } - ], + "outputs": [], "source": [ "root_group[\"root-foo\"].attrs[\"update\"] = \"new attr\"" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 10, "id": "2399312c-d53f-443f-8be1-b8702ba6513e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'SQ4T6Y45DXY9F0EYXE7ECBWHPC'" + "'V3SFRWRM255Z3JC3SYH0'" ] }, - "execution_count": 12, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "second_commit = await store.commit(\"added array, updated attr\")\n", + "second_commit = store.commit(\"added array, updated attr\")\n", "second_commit" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "id": "edad201d-d9b3-4825-887a-1e6b3bf07e57", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"shape\": [\n", - " 5,\n", - " 5,\n", - " 64,\n", - " 128\n", - " ],\n", - " \"data_type\": \"int32\",\n", - " \"chunk_grid\": {\n", - " \"name\": \"regular\",\n", - " \"configuration\": {\n", - " \"chunk_shape\": [\n", - " 1,\n", - " 2,\n", - " 8,\n", - " 2\n", - " ]\n", - " }\n", - " },\n", - " \"chunk_key_encoding\": {\n", - " \"name\": \"default\",\n", - " \"configuration\": {\n", - " \"separator\": \"/\"\n", - " }\n", - " },\n", - " \"fill_value\": -1,\n", - " \"codecs\": [\n", - " {\n", - " \"name\": \"bytes\",\n", - " \"configuration\": {\n", - " \"endian\": \"little\"\n", - 
" }\n", - " }\n", - " ],\n", - " \"dimension_names\": [\n", - " \"x\",\n", - " \"y\",\n", - " \"z\",\n", - " \"t\"\n", - " ],\n", - " \"attributes\": {\n", - " \"description\": \"icechunk test data\",\n", - " \"update\": \"new attr\"\n", - " }\n", - "}\n", - "store_path \n", - "{\n", - " \"shape\": [\n", - " 5,\n", - " 5,\n", - " 64,\n", - " 128\n", - " ],\n", - " \"data_type\": \"int32\",\n", - " \"chunk_grid\": {\n", - " \"name\": \"regular\",\n", - " \"configuration\": {\n", - " \"chunk_shape\": [\n", - " 1,\n", - " 2,\n", - " 8,\n", - " 2\n", - " ]\n", - " }\n", - " },\n", - " \"chunk_key_encoding\": {\n", - " \"name\": \"default\",\n", - " \"configuration\": {\n", - " \"separator\": \"/\"\n", - " }\n", - " },\n", - " \"fill_value\": -1,\n", - " \"codecs\": [\n", - " {\n", - " \"name\": \"bytes\",\n", - " \"configuration\": {\n", - " \"endian\": \"little\"\n", - " }\n", - " }\n", - " ],\n", - " \"dimension_names\": [\n", - " \"x\",\n", - " \"y\",\n", - " \"z\",\n", - " \"t\"\n", - " ],\n", - " \"attributes\": {\n", - " \"description\": \"icechunk test data\",\n", - " \"update\": \"new attr\"\n", - " }\n", - "}\n" - ] - } - ], + "outputs": [], "source": [ "assert len(root_group[\"root-foo\"].attrs) == 2\n", "assert len(root_group.members()) == 1" @@ -553,12 +357,12 @@ "id": "f2b6a3c6-9518-4ec4-921f-5303d4e851c7", "metadata": {}, "source": [ - "### Commiting when not on `HEAD` will fail." + "### Committing when not on `HEAD` will fail." ] }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 15, "id": "d904f719-98cf-4f51-8e9a-1631dcb3fcba", "metadata": {}, "outputs": [ @@ -571,11 +375,11 @@ } ], "source": [ - "await store.checkout(first_commit)\n", + "store.checkout(first_commit)\n", "root_group.attrs[\"update\"] = \"new attr 2\"\n", "\n", "try:\n", - " await store.commit(\"new attr 2\")\n", + " store.commit(\"new attr 2\")\n", "except ValueError as e:\n", " print(e)\n", "else:\n", @@ -592,100 +396,49 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 16, "id": "d31009db-8f99-48f1-b7bb-3f66875575cc", "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "{\n", - " \"shape\": [\n", - " 5,\n", - " 5,\n", - " 64,\n", - " 128\n", - " ],\n", - " \"data_type\": \"int32\",\n", - " \"chunk_grid\": {\n", - " \"name\": \"regular\",\n", - " \"configuration\": {\n", - " \"chunk_shape\": [\n", - " 1,\n", - " 2,\n", - " 8,\n", - " 2\n", - " ]\n", - " }\n", - " },\n", - " \"chunk_key_encoding\": {\n", - " \"name\": \"default\",\n", - " \"configuration\": {\n", - " \"separator\": \"/\"\n", - " }\n", - " },\n", - " \"fill_value\": -1,\n", - " \"codecs\": [\n", - " {\n", - " \"name\": \"bytes\",\n", - " \"configuration\": {\n", - " \"endian\": \"little\"\n", - " }\n", - " }\n", - " ],\n", - " \"dimension_names\": [\n", - " \"x\",\n", - " \"y\",\n", - " \"z\",\n", - " \"t\"\n", - " ],\n", - " \"attributes\": {\n", - " \"description\": \"icechunk test data\",\n", - " \"update\": \"new attr\"\n", - " }\n", - "}\n" - ] - }, { "data": { "text/plain": [ - "'DZNCW2X281JE5PXSE15N85THAW'" + "'5QGW2PE1A5MTRZED190G'" ] }, - "execution_count": 19, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "await store.reset()\n", - "await store.checkout(branch=\"main\")\n", + "store.reset()\n", + "store.checkout(branch=\"main\")\n", "root_group[\"root-foo\"].attrs[\"update\"] = \"new attr 2\"\n", - "third_commit = await store.commit(\"new attr 2\")\n", + "third_commit = store.commit(\"new attr 2\")\n", 
"third_commit" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 17, "id": "03f8d62b-d8a7-452c-b086-340bfcb76d50", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'DP8CF4KE7XYHVD0H1GPZ8H58V0'" + "'ARWA72NB2MAH90JJ285G'" ] }, - "execution_count": 21, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "root_group.attrs[\"update\"] = \"new attr 2\"\n", - "fourth_commit = await store.commit(\"rewrote array\")\n", + "fourth_commit = store.commit(\"rewrote array\")\n", "fourth_commit" ] }, @@ -699,7 +452,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 18, "id": "aee87354-4c44-4428-a4bf-d38d99b7e608", "metadata": {}, "outputs": [ @@ -709,7 +462,7 @@ "{'root-foo': dtype('int32')}" ] }, - "execution_count": 22, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -720,17 +473,17 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 19, "id": "f389f3f9-03d5-4625-9856-145e065785f2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'3ZBWXTZEYPJH8MEZVKM5MW7S0G'" + "'G1DMNFF0W1RCEEPY09B0'" ] }, - "execution_count": 23, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -747,7 +500,7 @@ "expected[\"group2/foo3\"] = create_array(\n", " group=newgroup, name=\"foo3\", dtype=np.int64, size=1 * 1024 * 32, fill_value=-1234\n", ")\n", - "fifth_commit = await store.commit(\"added groups and arrays\")\n", + "fifth_commit = store.commit(\"added groups and arrays\")\n", "fifth_commit" ] }, @@ -761,7 +514,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 20, "id": "bc9d1ef4-2c06-4147-ad4d-9e8051ac4ea8", "metadata": {}, "outputs": [], @@ -777,23 +530,23 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 21, "id": "4264bbfa-4193-45e9-bc82-932f488bff28", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'B33DVM1FBXFYB0S1EVH8SHG29G'" + "'RVZSK0518F73E6RSY990'" ] }, - "execution_count": 25, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "await store.commit(\"overwrote root-foo\")" + "store.commit(\"overwrote root-foo\")" ] }, { @@ -806,73 +559,20 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 22, "id": "895faf9f-c1ec-4b9b-9676-f6b1745d73de", "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "store_path \n", - "{\n", - " \"shape\": [\n", - " 4,\n", - " 4,\n", - " 64,\n", - " 128\n", - " ],\n", - " \"data_type\": \"int32\",\n", - " \"chunk_grid\": {\n", - " \"name\": \"regular\",\n", - " \"configuration\": {\n", - " \"chunk_shape\": [\n", - " 1,\n", - " 2,\n", - " 8,\n", - " 2\n", - " ]\n", - " }\n", - " },\n", - " \"chunk_key_encoding\": {\n", - " \"name\": \"default\",\n", - " \"configuration\": {\n", - " \"separator\": \"/\"\n", - " }\n", - " },\n", - " \"fill_value\": -1,\n", - " \"codecs\": [\n", - " {\n", - " \"name\": \"bytes\",\n", - " \"configuration\": {\n", - " \"endian\": \"little\"\n", - " }\n", - " }\n", - " ],\n", - " \"dimension_names\": [\n", - " \"x\",\n", - " \"y\",\n", - " \"z\",\n", - " \"t\"\n", - " ],\n", - " \"attributes\": {\n", - " \"description\": \"icechunk test data\"\n", - " }\n", - "}\n" - ] - }, { "data": { "text/plain": [ - "(('group1',\n", - " Group(_async_group=/group1>)),\n", + "(('group2', /group2>),\n", + " ('group1', /group1>),\n", " ('root-foo',\n", - " /root-foo shape=(4, 4, 64, 128) dtype=int32>),\n", - " ('group2',\n", - 
" Group(_async_group=/group2>)))" + " /root-foo shape=(4, 4, 64, 128) dtype=int32>))" ] }, - "execution_count": 26, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1002,69 +702,18 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 23, "id": "1fc3f29a-5915-4c66-bfed-5b75389e44e2", "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "store_path group2\n", - "{\n", - " \"shape\": [\n", - " 2,\n", - " 2,\n", - " 64,\n", - " 128\n", - " ],\n", - " \"data_type\": \"int64\",\n", - " \"chunk_grid\": {\n", - " \"name\": \"regular\",\n", - " \"configuration\": {\n", - " \"chunk_shape\": [\n", - " 1,\n", - " 1,\n", - " 8,\n", - " 2\n", - " ]\n", - " }\n", - " },\n", - " \"chunk_key_encoding\": {\n", - " \"name\": \"default\",\n", - " \"configuration\": {\n", - " \"separator\": \"/\"\n", - " }\n", - " },\n", - " \"fill_value\": -1234,\n", - " \"codecs\": [\n", - " {\n", - " \"name\": \"bytes\",\n", - " \"configuration\": {\n", - " \"endian\": \"little\"\n", - " }\n", - " }\n", - " ],\n", - " \"dimension_names\": [\n", - " \"x\",\n", - " \"y\",\n", - " \"z\",\n", - " \"t\"\n", - " ],\n", - " \"attributes\": {\n", - " \"description\": \"icechunk test data\"\n", - " }\n", - "}\n" - ] - }, { "data": { "text/plain": [ "(('foo3',\n", - " /group2/foo3 shape=(2, 2, 64, 128) dtype=int64>),)" + " /group2/foo3 shape=(2, 2, 64, 128) dtype=int64>),)" ] }, - "execution_count": 28, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1083,7 +732,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 24, "id": "fb608382-04e8-4deb-8e2e-3f130845cf8c", "metadata": {}, "outputs": [ @@ -1091,62 +740,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "{\n", - " \"shape\": [\n", - " 2,\n", - " 2,\n", - " 64,\n", - " 128\n", - " ],\n", - " \"data_type\": \"int64\",\n", - " \"chunk_grid\": {\n", - " \"name\": \"regular\",\n", - " \"configuration\": {\n", - " \"chunk_shape\": [\n", - " 1,\n", - " 1,\n", - " 8,\n", - " 2\n", - " ]\n", - " }\n", - " },\n", - " \"chunk_key_encoding\": {\n", - " \"name\": \"default\",\n", - " \"configuration\": {\n", - " \"separator\": \"/\"\n", - " }\n", - " },\n", - " \"fill_value\": -1234,\n", - " \"codecs\": [\n", - " {\n", - " \"name\": \"bytes\",\n", - " \"configuration\": {\n", - " \"endian\": \"little\"\n", - " }\n", - " }\n", - " ],\n", - " \"dimension_names\": [\n", - " \"x\",\n", - " \"y\",\n", - " \"z\",\n", - " \"t\"\n", - " ],\n", - " \"attributes\": {\n", - " \"description\": \"icechunk test data\"\n", - " }\n", - "}\n", - "/group2/foo3 shape=(2, 2, 64, 128) dtype=int64>\n", - "/group2/foo3 shape=(4, 2, 64, 128) dtype=int64>\n", + "/group2/foo3 shape=(2, 2, 64, 128) dtype=int64>\n", + "/group2/foo3 shape=(4, 2, 64, 128) dtype=int64>\n", "[ 0 16384]\n" ] }, { "data": { "text/plain": [ - "'JG601CAP09Q7P19RQ7JSH3AWNR'" + "'JHCPX1W73WZV399MYQZ0'" ] }, - "execution_count": 29, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -1155,13 +760,13 @@ "array = root_group[\"group2/foo3\"]\n", "print(array)\n", "\n", - "array = array.resize((array.shape[0]*2, *array.shape[1:]))\n", + "array = array.resize((array.shape[0] * 2, *array.shape[1:]))\n", "print(array)\n", - "array[array.shape[0]//2:, ...] = expected[\"group2/foo3\"]\n", - "print(array[2:, 0,0,0])\n", - "expected[\"group2/foo3\"] = np.concatenate([expected[\"group2/foo3\"]]*2, axis=0)\n", + "array[array.shape[0] // 2 :, ...] 
= expected[\"group2/foo3\"]\n", + "print(array[2:, 0, 0, 0])\n", + "expected[\"group2/foo3\"] = np.concatenate([expected[\"group2/foo3\"]] * 2, axis=0)\n", "\n", - "await store.commit(\"appended to group2/foo3\")" + "store.commit(\"appended to group2/foo3\")" ] }, { @@ -1174,224 +779,21 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 26, "id": "820cb181-06cb-4ee2-af5b-f5904a147b32", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "root-foo\n", - "{\n", - " \"shape\": [\n", - " 4,\n", - " 4,\n", - " 64,\n", - " 128\n", - " ],\n", - " \"data_type\": \"int32\",\n", - " \"chunk_grid\": {\n", - " \"name\": \"regular\",\n", - " \"configuration\": {\n", - " \"chunk_shape\": [\n", - " 1,\n", - " 2,\n", - " 8,\n", - " 2\n", - " ]\n", - " }\n", - " },\n", - " \"chunk_key_encoding\": {\n", - " \"name\": \"default\",\n", - " \"configuration\": {\n", - " \"separator\": \"/\"\n", - " }\n", - " },\n", - " \"fill_value\": -1,\n", - " \"codecs\": [\n", - " {\n", - " \"name\": \"bytes\",\n", - " \"configuration\": {\n", - " \"endian\": \"little\"\n", - " }\n", - " }\n", - " ],\n", - " \"dimension_names\": [\n", - " \"x\",\n", - " \"y\",\n", - " \"z\",\n", - " \"t\"\n", - " ],\n", - " \"attributes\": {\n", - " \"description\": \"icechunk test data\"\n", - " }\n", - "}\n", - "numchunks: 4096\n", - "0.486346960067749\n", - "group1/foo1\n", - "{\n", - " \"shape\": [\n", - " 4,\n", - " 4,\n", - " 64,\n", - " 128\n", - " ],\n", - " \"data_type\": \"float32\",\n", - " \"chunk_grid\": {\n", - " \"name\": \"regular\",\n", - " \"configuration\": {\n", - " \"chunk_shape\": [\n", - " 1,\n", - " 2,\n", - " 8,\n", - " 2\n", - " ]\n", - " }\n", - " },\n", - " \"chunk_key_encoding\": {\n", - " \"name\": \"default\",\n", - " \"configuration\": {\n", - " \"separator\": \"/\"\n", - " }\n", - " },\n", - " \"fill_value\": -1234.0,\n", - " \"codecs\": [\n", - " {\n", - " \"name\": \"bytes\",\n", - " \"configuration\": {\n", - " \"endian\": \"little\"\n", - " }\n", - " }\n", - " ],\n", - " \"dimension_names\": [\n", - " \"x\",\n", - " \"y\",\n", - " \"z\",\n", - " \"t\"\n", - " ],\n", - " \"attributes\": {\n", - " \"description\": \"icechunk test data\"\n", - " }\n", - "}\n", - "numchunks: 4096\n", - "0.42878198623657227\n", - "group1/foo2\n", - "{\n", - " \"shape\": [\n", - " 2,\n", - " 2,\n", - " 64,\n", - " 128\n", - " ],\n", - " \"data_type\": \"float16\",\n", - " \"chunk_grid\": {\n", - " \"name\": \"regular\",\n", - " \"configuration\": {\n", - " \"chunk_shape\": [\n", - " 1,\n", - " 1,\n", - " 8,\n", - " 2\n", - " ]\n", - " }\n", - " },\n", - " \"chunk_key_encoding\": {\n", - " \"name\": \"default\",\n", - " \"configuration\": {\n", - " \"separator\": \"/\"\n", - " }\n", - " },\n", - " \"fill_value\": -1234.0,\n", - " \"codecs\": [\n", - " {\n", - " \"name\": \"bytes\",\n", - " \"configuration\": {\n", - " \"endian\": \"little\"\n", - " }\n", - " }\n", - " ],\n", - " \"dimension_names\": [\n", - " \"x\",\n", - " \"y\",\n", - " \"z\",\n", - " \"t\"\n", - " ],\n", - " \"attributes\": {\n", - " \"description\": \"icechunk test data\"\n", - " }\n", - "}\n", - "numchunks: 2048\n", - "0.2633249759674072\n", - "group2/foo3\n", - "{\n", - " \"shape\": [\n", - " 4,\n", - " 2,\n", - " 64,\n", - " 128\n", - " ],\n", - " \"data_type\": \"int64\",\n", - " \"chunk_grid\": {\n", - " \"name\": \"regular\",\n", - " \"configuration\": {\n", - " \"chunk_shape\": [\n", - " 1,\n", - " 1,\n", - " 8,\n", - " 2\n", - " ]\n", - " }\n", - " },\n", - " \"chunk_key_encoding\": 
{\n", - " \"name\": \"default\",\n", - " \"configuration\": {\n", - " \"separator\": \"/\"\n", - " }\n", - " },\n", - " \"fill_value\": -1234,\n", - " \"codecs\": [\n", - " {\n", - " \"name\": \"bytes\",\n", - " \"configuration\": {\n", - " \"endian\": \"little\"\n", - " }\n", - " }\n", - " ],\n", - " \"dimension_names\": [\n", - " \"x\",\n", - " \"y\",\n", - " \"z\",\n", - " \"t\"\n", - " ],\n", - " \"attributes\": {\n", - " \"description\": \"icechunk test data\"\n", - " }\n", - "}\n", - "numchunks: 4096\n", - "0.4318978786468506\n" - ] - } - ], + "outputs": [], "source": [ - "import time\n", + "# import time\n", "\n", - "for key, value in expected.items():\n", - " print(key)\n", - " tic = time.time()\n", - " array = root_group[key]\n", - " assert array.dtype == value.dtype, (array.dtype, value.dtype)\n", - " print(f\"numchunks: {math.prod(s // c for s, c in zip(array.shape, array.chunks))}\")\n", - " np.testing.assert_array_equal(array[:], value)\n", - " print(time.time() - tic)" - ] - }, - { - "cell_type": "markdown", - "id": "4c728fd3-4dc0-4b23-91c8-7cfe0537050b", - "metadata": {}, - "source": [ - "change values of \"group1/foo1\"" + "# for key, value in expected.items():\n", + "# print(key)\n", + "# tic = time.time()\n", + "# array = root_group[key]\n", + "# assert array.dtype == value.dtype, (array.dtype, value.dtype)\n", + "# print(f\"numchunks: {math.prod(s // c for s, c in zip(array.shape, array.chunks, strict=False))}\")\n", + "# np.testing.assert_array_equal(array[:], value)\n", + "# print(time.time() - tic)" ] } ], diff --git a/icechunk-python/notebooks/demo-s3.ipynb b/icechunk-python/notebooks/demo-s3.ipynb index 21ce86b7..e69e3bfa 100644 --- a/icechunk-python/notebooks/demo-s3.ipynb +++ b/icechunk-python/notebooks/demo-s3.ipynb @@ -18,8 +18,7 @@ "outputs": [], "source": [ "import zarr\n", - "\n", - "from icechunk import IcechunkStore, Storage" + "from icechunk import IcechunkStore, StorageConfig" ] }, { @@ -39,7 +38,9 @@ "metadata": {}, "outputs": [], "source": [ - "s3_storage = Storage.s3_from_env(bucket=\"icechunk-test\", prefix=\"oscar-demo-repository\")" + "s3_storage = StorageConfig.s3_from_env(\n", + " bucket=\"icechunk-test\", prefix=\"oscar-demo-repository\"\n", + ")" ] }, { @@ -49,7 +50,7 @@ "metadata": {}, "outputs": [], "source": [ - "store = await IcechunkStore.create(\n", + "store = IcechunkStore.create(\n", " storage=s3_storage,\n", " mode=\"w\",\n", ")" @@ -1063,16 +1064,16 @@ "text": [ "u\n", "D2YNJWWTKW6DY8ECPJZG\n", - "commited; 43.68043661117554 seconds\n", + "committed; 43.68043661117554 seconds\n", "um\n", "V0RSK39P1EXKB37F6Z10\n", - "commited; 44.08490180969238 seconds\n", + "committed; 44.08490180969238 seconds\n", "v\n", "JNDCHT5MF2MWRHYY8Q1G\n", - "commited; 61.78669619560242 seconds\n", + "committed; 61.78669619560242 seconds\n", "vm\n", "GAKXY70VJ2NQ3ANMEE10\n", - "commited; 55.72252893447876 seconds\n" + "committed; 55.72252893447876 seconds\n" ] } ], @@ -1085,14 +1086,14 @@ " group.create_array(\n", " name=var,\n", " shape=oscar[var].shape,\n", - " chunk_shape = (1, 1, 481, 1201),\n", + " chunk_shape=(1, 1, 481, 1201),\n", " fill_value=-1234567,\n", " dtype=oscar[var].dtype,\n", " data=oscar[var],\n", " exists_ok=True,\n", " )\n", - " print(await store.commit(f\"wrote {var}\"))\n", - " print(f\"commited; {time.time() - tic} seconds\")" + " print(store.commit(f\"wrote {var}\"))\n", + " print(f\"committed; {time.time() - tic} seconds\")" ] }, { @@ -1127,7 +1128,7 @@ } ], "source": [ - "[(sn.id, sn.message, sn.written_at) async for sn in 
store.ancestry()]" + "store.ancestry()" ] }, { @@ -1146,11 +1147,12 @@ "outputs": [], "source": [ "import zarr\n", - "\n", - "from icechunk import IcechunkStore, Storage\n", + "from icechunk import IcechunkStore, StorageConfig\n", "\n", "# TODO: catalog will handle this\n", - "s3_storage = Storage.s3_from_env(bucket=\"icechunk-test\", prefix=\"oscar-demo-repository\")" + "s3_storage = StorageConfig.s3_from_env(\n", + " bucket=\"icechunk-test\", prefix=\"oscar-demo-repository\"\n", + ")" ] }, { @@ -1171,7 +1173,7 @@ } ], "source": [ - "store = await IcechunkStore.open_existing(\n", + "store = IcechunkStore.open_existing(\n", " storage=s3_storage,\n", " mode=\"r\",\n", ")\n", @@ -1218,7 +1220,7 @@ } ], "source": [ - "[(sn.id, sn.message, sn.written_at) async for sn in store.ancestry()]" + "store.ancestry()" ] }, { @@ -1294,8 +1296,8 @@ "metadata": {}, "outputs": [], "source": [ - "import matplotlib.pyplot as plt\n", - "import matplotlib as mpl" + "import matplotlib as mpl\n", + "import matplotlib.pyplot as plt" ] }, { @@ -1339,9 +1341,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:icechunk]", + "display_name": ".venv", "language": "python", - "name": "conda-env-icechunk-py" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -1353,7 +1355,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.6" + "version": "3.11.9" } }, "nbformat": 4, diff --git a/icechunk-python/notebooks/memorystore.ipynb b/icechunk-python/notebooks/memorystore.ipynb index 98708ec5..70d727a7 100644 --- a/icechunk-python/notebooks/memorystore.ipynb +++ b/icechunk-python/notebooks/memorystore.ipynb @@ -34,7 +34,9 @@ } ], "source": [ - "store = await icechunk.IcechunkStore.create(storage=icechunk.Storage.memory(\"\"), mode=\"w\")\n", + "store = icechunk.IcechunkStore.create(\n", + " storage=icechunk.StorageConfig.memory(\"\")\n", + ")\n", "store" ] }, @@ -83,7 +85,9 @@ } ], "source": [ - "air_temp = group.create_array(\"air_temp\", shape=(1000, 1000), chunk_shape=(100, 100), dtype=\"i4\")\n", + "air_temp = group.create_array(\n", + " \"air_temp\", shape=(1000, 1000), chunk_shape=(100, 100), dtype=\"i4\"\n", + ")\n", "air_temp" ] }, @@ -159,7 +163,7 @@ } ], "source": [ - "snapshot_id = await store.commit(\"Initial commit\")\n", + "snapshot_id = store.commit(\"Initial commit\")\n", "snapshot_id" ] }, @@ -223,7 +227,7 @@ } ], "source": [ - "new_snapshot_id = await store.commit(\"Change air temp to 54\")\n", + "new_snapshot_id = store.commit(\"Change air temp to 54\")\n", "new_snapshot_id" ] }, @@ -251,7 +255,7 @@ } ], "source": [ - "await store.checkout(snapshot_id=snapshot_id)\n", + "store.checkout(snapshot_id=snapshot_id)\n", "air_temp[200, 6]" ] } diff --git a/icechunk-python/notebooks/performance/era5_xarray-Icechunk.ipynb b/icechunk-python/notebooks/performance/era5_xarray-Icechunk.ipynb new file mode 100644 index 00000000..18efd9c3 --- /dev/null +++ b/icechunk-python/notebooks/performance/era5_xarray-Icechunk.ipynb @@ -0,0 +1,1066 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "40c929a3-87d4-4c0e-a97d-1300d8adcae0", + "metadata": {}, + "source": [ + "# Icechunk Performance - Icechunk\n", + "\n", + "Using data from the [NCAR ERA5 AWS Public Dataset](https://nsf-ncar-era5.s3.amazonaws.com/index.html)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b2904d5f-090b-4344-a2f7-99096ba26d27", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "xarray: 0.9.7.dev3734+g26081d4f\n", + "dask: 2024.9.1+8.g70f56e28\n", + "zarr: 3.0.0b0\n", + "icechunk: 0.1.0-alpha.1\n" + ] + } + ], + "source": [ + "import xarray as xr\n", + "import zarr\n", + "import dask\n", + "import fsspec\n", + "from dask.diagnostics import ProgressBar\n", + "\n", + "import icechunk\n", + "from icechunk import IcechunkStore, StorageConfig\n", + "\n", + "print('xarray: ', xr.__version__)\n", + "print('dask: ', dask.__version__)\n", + "print('zarr: ', zarr.__version__)\n", + "print('icechunk:', icechunk.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "5110e6d0-1c9a-4943-9f5d-a0d96bcbb5e0", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "zarr.config.set(\n", + " {\n", + " 'threading.max_workers': 16,\n", + " 'async.concurrency': 128\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "081e1a71-873e-45c3-b77d-5b7aa1617286", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 246 ms, sys: 51.8 ms, total: 297 ms\n", + "Wall time: 2.22 s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/srv/conda/envs/icechunk-pip/lib/python3.12/site-packages/xarray/backends/api.py:357: UserWarning: The specified chunks separate the stored chunks along dimension \"time\" starting at index 1. This could degrade performance. Instead, consider rechunking after loading.\n", + " var_chunks = _get_chunk(var, chunks, chunkmanager)\n" + ] + } + ], + "source": [ + "url = \"https://nsf-ncar-era5.s3.amazonaws.com/e5.oper.an.pl/194106/e5.oper.an.pl.128_060_pv.ll025sc.1941060100_1941060123.nc\"\n", + "%time ds = xr.open_dataset(fsspec.open(url).open(), engine=\"h5netcdf\", chunks={\"time\": 1})\n", + "ds = ds.drop_encoding()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b3048527-c50f-451c-9500-cac6c22dd1bc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Size: 4GB\n", + "Dimensions: (time: 24, level: 37, latitude: 721, longitude: 1440)\n", + "Coordinates:\n", + " * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", + " * level (level) float64 296B 1.0 2.0 3.0 5.0 ... 925.0 950.0 975.0 1e+03\n", + " * longitude (longitude) float64 12kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8\n", + " * time (time) datetime64[ns] 192B 1941-06-01 ... 
1941-06-01T23:00:00\n", + "Data variables:\n", + " PV (time, level, latitude, longitude) float32 4GB dask.array\n", + " utc_date (time) int32 96B dask.array\n", + "Attributes:\n", + " DATA_SOURCE: ECMWF: https://cds.climate.copernicus.eu, Copernicu...\n", + " NETCDF_CONVERSION: CISL RDA: Conversion from ECMWF GRIB 1 data to netC...\n", + " NETCDF_VERSION: 4.8.1\n", + " CONVERSION_PLATFORM: Linux r1i4n4 4.12.14-95.51-default #1 SMP Fri Apr 1...\n", + " CONVERSION_DATE: Wed May 10 06:33:49 MDT 2023\n", + " Conventions: CF-1.6\n", + " NETCDF_COMPRESSION: NCO: Precision-preserving compression to netCDF4/HD...\n", + " history: Wed May 10 06:34:19 2023: ncks -4 --ppc default=7 e...\n", + " NCO: netCDF Operators version 5.0.3 (Homepage = http://n...\n" + ] + } + ], + "source": [ + "print(ds)" + ] + }, + { + "cell_type": "markdown", + "id": "7f4a801c-b570-45e3-b37f-2e140a2fb273", + "metadata": {}, + "source": [ + "### Load Data from HDF5 File\n", + "\n", + "This illustrates how loading directly from HDF5 files on S3 can be slow, even with Dask." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "29e344c6-a25e-4342-979f-d2d2c7aed7a7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 53.73 ss\n" + ] + } + ], + "source": [ + "with ProgressBar():\n", + " dsl = ds.load()" + ] + }, + { + "cell_type": "markdown", + "id": "bdbd3f6c-e62c-4cfc-8cfb-b0fa22b6bddd", + "metadata": {}, + "source": [ + "### Initialize Icechunk Repo" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9283b1f5-a0e9-43ef-bd8a-5985bedc2d17", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prefix = \"ryan/icechunk-tests-era5-999\"\n", + "store = IcechunkStore.create(\n", + " storage=StorageConfig.s3_from_env(\n", + " bucket=\"icechunk-test\",\n", + " prefix=prefix\n", + " ),\n", + " mode=\"w\"\n", + ")\n", + "store" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b13b469d-45d7-4844-b153-b44d274cb220", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "('main', 'B8ZZN2YZS6NQKM17X68G')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "store.branch, store.snapshot_id" + ] + }, + { + "cell_type": "markdown", + "id": "12c4ce5a-f1dd-4576-9d89-071583cd92a4", + "metadata": {}, + "source": [ + "### Store Data To Icechunk\n", + "\n", + "We specify encoding to set both compression and chunk size." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "67c6389d-79a0-4992-b845-6a633cb4d86b", + "metadata": {}, + "outputs": [], + "source": [ + "encoding = {\n", + " \"PV\": {\n", + " \"codecs\": [zarr.codecs.BytesCodec(), zarr.codecs.ZstdCodec()],\n", + " \"chunks\": (1, 1, 721, 1440)\n", + " }\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "4e632068-fb29-4a6f-a3d0-d19edb8f68a2", + "metadata": {}, + "source": [ + "Note that Dask is not required to obtain good performance when reading and writing. Zarr and Icechunk use multithreading and asyncio internally." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b9a8c5ab-cc5a-4a05-b4ba-3b52be187e18", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 54 s, sys: 1.56 s, total: 55.5 s\n", + "Wall time: 18.9 s\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time dsl.to_zarr(store, zarr_format=3, consolidated=False, encoding=encoding)" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "bc33e613-7527-4f4f-92be-c1a20c2b8624", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 18.02 ss\n" + ] + } + ], + "source": [ + "# with ProgressBar():\n", + "# (dsl\n", + "# .chunk({\"time\": 1, \"level\": 10})\n", + "# .to_zarr(store, zarr_format=3, consolidated=False, encoding=encoding)\n", + "# )" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b6b19d8b-3655-4213-99c9-5857c2ac126b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'AS64P9SQ7NY1P22P8GS0'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "store.commit(\"wrote data\")" + ] + }, + { + "cell_type": "markdown", + "id": "34b1a12c-9640-4f8b-a5fc-2ade040b437c", + "metadata": {}, + "source": [ + "### Read Data Back" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "e74e2d0e-c8ad-44ec-90b6-51de574aafa9", + "metadata": {}, + "outputs": [], + "source": [ + "store = IcechunkStore.open_existing(\n", + " storage=StorageConfig.s3_from_env(\n", + " bucket=\"icechunk-test\",\n", + " prefix=prefix\n", + " ),\n", + " mode=\"r\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a9c1bfc7-61d2-4a92-ab82-b026e7b9fcf6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 16.8 ms, sys: 2.45 ms, total: 19.2 ms\n", + "Wall time: 97.4 ms\n" + ] + } + ], + "source": [ + "%time dsic = xr.open_dataset(store, consolidated=False, engine=\"zarr\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c09243a3-9965-4952-a7af-21f4e95697b9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Size: 4GB\n", + "Dimensions: (level: 37, latitude: 721, longitude: 1440, time: 24)\n", + "Coordinates:\n", + " * level (level) float64 296B 1.0 2.0 3.0 5.0 ... 925.0 950.0 975.0 1e+03\n", + " * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", + " * longitude (longitude) float64 12kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8\n", + " * time (time) datetime64[ns] 192B 1941-06-01 ... 
1941-06-01T23:00:00\n", + "Data variables:\n", + " PV (time, level, latitude, longitude) float32 4GB ...\n", + " utc_date (time) int32 96B ...\n", + "Attributes:\n", + " CONVERSION_DATE: Wed May 10 06:33:49 MDT 2023\n", + " CONVERSION_PLATFORM: Linux r1i4n4 4.12.14-95.51-default #1 SMP Fri Apr 1...\n", + " Conventions: CF-1.6\n", + " DATA_SOURCE: ECMWF: https://cds.climate.copernicus.eu, Copernicu...\n", + " NCO: netCDF Operators version 5.0.3 (Homepage = http://n...\n", + " NETCDF_COMPRESSION: NCO: Precision-preserving compression to netCDF4/HD...\n", + " NETCDF_CONVERSION: CISL RDA: Conversion from ECMWF GRIB 1 data to netC...\n", + " NETCDF_VERSION: 4.8.1\n", + " history: Wed May 10 06:34:19 2023: ncks -4 --ppc default=7 e...\n" + ] + } + ], + "source": [ + "print(dsic)" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "feb23457-c6fe-4363-8393-c92ab1ae7a89", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 16.8 ms, sys: 78 μs, total: 16.8 ms\n", + "Wall time: 102 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array(0.00710905, dtype=float32)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time dsic.PV[0, 0, 0, 0].values" + ] + }, + { + "cell_type": "markdown", + "id": "2eef8e3a-c0ce-4383-b76a-e852a50f7398", + "metadata": {}, + "source": [ + "As with writing, Dask is not required for performant reading of the data.\n", + "In this example we can load the entire dataset (nearly 4GB) in 8s. " + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d5103624-554c-4d18-a323-d24f82b99818", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 11 s, sys: 3.67 s, total: 14.7 s\n", + "Wall time: 2.03 s\n" + ] + } + ], + "source": [ + "%time _ = dsic.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "7782d02a-db34-4113-8fe6-6162a129d290", + "metadata": {}, + "outputs": [], + "source": [ + "xr.testing.assert_identical(_, ds)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "c2d61f3e-c4b6-4d52-a55f-6fad900d04db", + "metadata": {}, + "outputs": [], + "source": [ + "dsicc = dsic.chunk({\"time\": 1, \"level\": 10})" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "894fa53e-845a-41fe-a7a7-4cf859ea5928", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 2.13 sms\n" + ] + } + ], + "source": [ + "from dask.diagnostics import ProgressBar\n", + "with ProgressBar():\n", + " _ = dsicc.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "f8e13924-9daa-488c-be67-ab07ab4fcc99", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset> Size: 4GB\n",
+       "Dimensions:    (latitude: 721, level: 37, time: 24, longitude: 1440)\n",
+       "Coordinates:\n",
+       "  * latitude   (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n",
+       "  * level      (level) float64 296B 1.0 2.0 3.0 5.0 ... 925.0 950.0 975.0 1e+03\n",
+       "  * longitude  (longitude) float64 12kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8\n",
+       "  * time       (time) datetime64[ns] 192B 1941-06-01 ... 1941-06-01T23:00:00\n",
+       "Data variables:\n",
+       "    utc_date   (time) int32 96B 1941060100 1941060101 ... 1941060122 1941060123\n",
+       "    PV         (time, level, latitude, longitude) float32 4GB 0.007109 ... -1...\n",
+       "Attributes:\n",
+       "    CONVERSION_DATE:      Wed May 10 06:33:49 MDT 2023\n",
+       "    CONVERSION_PLATFORM:  Linux r1i4n4 4.12.14-95.51-default #1 SMP Fri Apr 1...\n",
+       "    Conventions:          CF-1.6\n",
+       "    DATA_SOURCE:          ECMWF: https://cds.climate.copernicus.eu, Copernicu...\n",
+       "    NCO:                  netCDF Operators version 5.0.3 (Homepage = http://n...\n",
+       "    NETCDF_COMPRESSION:   NCO: Precision-preserving compression to netCDF4/HD...\n",
+       "    NETCDF_CONVERSION:    CISL RDA: Conversion from ECMWF GRIB 1 data to netC...\n",
+       "    NETCDF_VERSION:       4.8.1\n",
+       "    history:              Wed May 10 06:34:19 2023: ncks -4 --ppc default=7 e...
" + ], + "text/plain": [ + " Size: 4GB\n", + "Dimensions: (latitude: 721, level: 37, time: 24, longitude: 1440)\n", + "Coordinates:\n", + " * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", + " * level (level) float64 296B 1.0 2.0 3.0 5.0 ... 925.0 950.0 975.0 1e+03\n", + " * longitude (longitude) float64 12kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8\n", + " * time (time) datetime64[ns] 192B 1941-06-01 ... 1941-06-01T23:00:00\n", + "Data variables:\n", + " utc_date (time) int32 96B 1941060100 1941060101 ... 1941060122 1941060123\n", + " PV (time, level, latitude, longitude) float32 4GB 0.007109 ... -1...\n", + "Attributes:\n", + " CONVERSION_DATE: Wed May 10 06:33:49 MDT 2023\n", + " CONVERSION_PLATFORM: Linux r1i4n4 4.12.14-95.51-default #1 SMP Fri Apr 1...\n", + " Conventions: CF-1.6\n", + " DATA_SOURCE: ECMWF: https://cds.climate.copernicus.eu, Copernicu...\n", + " NCO: netCDF Operators version 5.0.3 (Homepage = http://n...\n", + " NETCDF_COMPRESSION: NCO: Precision-preserving compression to netCDF4/HD...\n", + " NETCDF_CONVERSION: CISL RDA: Conversion from ECMWF GRIB 1 data to netC...\n", + " NETCDF_VERSION: 4.8.1\n", + " history: Wed May 10 06:34:19 2023: ncks -4 --ppc default=7 e..." + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "actual = _\n", + "actual" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "29b08fa1-aefa-4b59-bb64-d31fe88d614a", + "metadata": {}, + "outputs": [], + "source": [ + "xr.testing.assert_identical(actual, dsl)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0cf1d7dc-44b4-4c92-bfe1-5fde04ac0b62", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/icechunk-python/notebooks/performance/era5_xarray-zarr2.ipynb b/icechunk-python/notebooks/performance/era5_xarray-zarr2.ipynb new file mode 100644 index 00000000..1ea40b00 --- /dev/null +++ b/icechunk-python/notebooks/performance/era5_xarray-zarr2.ipynb @@ -0,0 +1,443 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "40c929a3-87d4-4c0e-a97d-1300d8adcae0", + "metadata": {}, + "source": [ + "# Icechunk Performance - Zarr V2\n", + "\n", + "Using data from the [NCAR ERA5 AWS Public Dataset](https://nsf-ncar-era5.s3.amazonaws.com/index.html)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b2904d5f-090b-4344-a2f7-99096ba26d27", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "xarray: 2024.7.0\n", + "dask: 2024.6.2\n", + "zarr: 2.18.2\n" + ] + } + ], + "source": [ + "import xarray as xr\n", + "import zarr\n", + "import dask\n", + "import fsspec\n", + "from dask.diagnostics import ProgressBar\n", + "\n", + "print('xarray: ', xr.__version__)\n", + "print('dask: ', dask.__version__)\n", + "print('zarr: ', zarr.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "081e1a71-873e-45c3-b77d-5b7aa1617286", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 123 ms, sys: 44.5 ms, total: 168 ms\n", + "Wall time: 1.91 s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/srv/conda/envs/notebook/lib/python3.12/site-packages/xarray/core/dataset.py:277: UserWarning: The specified chunks separate the stored chunks along dimension \"time\" starting at index 1. This could degrade performance. Instead, consider rechunking after loading.\n", + " warnings.warn(\n" + ] + } + ], + "source": [ + "url = \"https://nsf-ncar-era5.s3.amazonaws.com/e5.oper.an.pl/194106/e5.oper.an.pl.128_060_pv.ll025sc.1941060100_1941060123.nc\"\n", + "%time dsc = xr.open_dataset(fsspec.open(url).open(), engine=\"h5netcdf\", chunks={\"time\": 1}).drop_encoding()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b3048527-c50f-451c-9500-cac6c22dd1bc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Size: 4GB\n", + "Dimensions: (time: 24, level: 37, latitude: 721, longitude: 1440)\n", + "Coordinates:\n", + " * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", + " * level (level) float64 296B 1.0 2.0 3.0 5.0 ... 925.0 950.0 975.0 1e+03\n", + " * longitude (longitude) float64 12kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8\n", + " * time (time) datetime64[ns] 192B 1941-06-01 ... 1941-06-01T23:00:00\n", + "Data variables:\n", + " PV (time, level, latitude, longitude) float32 4GB dask.array\n", + " utc_date (time) int32 96B dask.array\n", + "Attributes:\n", + " DATA_SOURCE: ECMWF: https://cds.climate.copernicus.eu, Copernicu...\n", + " NETCDF_CONVERSION: CISL RDA: Conversion from ECMWF GRIB 1 data to netC...\n", + " NETCDF_VERSION: 4.8.1\n", + " CONVERSION_PLATFORM: Linux r1i4n4 4.12.14-95.51-default #1 SMP Fri Apr 1...\n", + " CONVERSION_DATE: Wed May 10 06:33:49 MDT 2023\n", + " Conventions: CF-1.6\n", + " NETCDF_COMPRESSION: NCO: Precision-preserving compression to netCDF4/HD...\n", + " history: Wed May 10 06:34:19 2023: ncks -4 --ppc default=7 e...\n", + " NCO: netCDF Operators version 5.0.3 (Homepage = http://n...\n" + ] + } + ], + "source": [ + "print(ds)" + ] + }, + { + "cell_type": "markdown", + "id": "7f4a801c-b570-45e3-b37f-2e140a2fb273", + "metadata": {}, + "source": [ + "### Load Data from HDF5 File\n", + "\n", + "This illustrates how loading directly from HDF5 files on S3 can be slow, even with Dask." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "29e344c6-a25e-4342-979f-d2d2c7aed7a7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 61.19 ss\n" + ] + } + ], + "source": [ + "with ProgressBar():\n", + " dsl = ds.load()" + ] + }, + { + "cell_type": "markdown", + "id": "bdbd3f6c-e62c-4cfc-8cfb-b0fa22b6bddd", + "metadata": {}, + "source": [ + "### Write Zarr Store - No Dask" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "67c6389d-79a0-4992-b845-6a633cb4d86b", + "metadata": {}, + "outputs": [], + "source": [ + "encoding = {\n", + " \"PV\": {\n", + " \"compressor\": zarr.Zstd(),\n", + " \"chunks\": (1, 1, 721, 1440)\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "bda3c3f9-4714-471b-abc0-051c3a6d8384", + "metadata": {}, + "outputs": [], + "source": [ + "target_url = \"s3://icechunk-test/ryan/zarr-v2/test-era5-11\"\n", + "store = zarr.storage.FSStore(target_url)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "b9a8c5ab-cc5a-4a05-b4ba-3b52be187e18", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 21.4 s, sys: 3.73 s, total: 25.1 s\n", + "Wall time: 31.8 s\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time dsl.to_zarr(store, consolidated=False, encoding=encoding, mode=\"w\")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "3718012b-3157-47a1-8ac4-f72d27a2132f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 12.30 s\n" + ] + } + ], + "source": [ + "# with dask\n", + "dslc = dsl.chunk({\"time\": 1, \"level\": 1})\n", + "store_d = zarr.storage.FSStore(target_url + '-dask')\n", + "with ProgressBar():\n", + " dslc.to_zarr(store_d, consolidated=False, encoding=encoding, mode=\"w\")" + ] + }, + { + "cell_type": "markdown", + "id": "34b1a12c-9640-4f8b-a5fc-2ade040b437c", + "metadata": {}, + "source": [ + "### Read Data Back" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a9c1bfc7-61d2-4a92-ab82-b026e7b9fcf6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 50.4 ms, sys: 7.21 ms, total: 57.6 ms\n", + "Wall time: 487 ms\n" + ] + } + ], + "source": [ + "%time dss = xr.open_dataset(store, consolidated=False, engine=\"zarr\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "feb23457-c6fe-4363-8393-c92ab1ae7a89", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 15.2 ms, sys: 671 μs, total: 15.9 ms\n", + "Wall time: 97.4 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array(0.00710905, dtype=float32)" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time dss.PV[0, 0, 0, 0].values" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "d5103624-554c-4d18-a323-d24f82b99818", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 8.6 s, sys: 1.53 s, total: 10.1 s\n", + "Wall time: 22.6 s\n" + ] + } + ], + "source": [ + "%time _ = dss.compute()" + ] + }, + { + 
"cell_type": "code", + "execution_count": 15, + "id": "d302b787-3279-4564-a29a-5be82c82dd5d", + "metadata": {}, + "outputs": [], + "source": [ + "dssd = xr.open_dataset(store, consolidated=False, engine=\"zarr\").chunk({\"time\": 1, \"level\": 10})" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "ab01b1f7-42ff-41cf-aac6-c2c93f968227", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 4.55 sms\n" + ] + } + ], + "source": [ + "with ProgressBar():\n", + " _ = dssd.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "2482b40b-3ae9-45eb-8e26-61bf3b41d89e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "946.755253" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "1893510506 / 2 / 1e6" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "7c0a855f-5173-46f8-b296-d20c582be1cd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Name               : /
Type               : zarr.hierarchy.Group
Read-only          : True
Store type         : zarr.storage.FSStore
No. members        : 6
No. arrays         : 6
No. groups         : 0
Arrays             : PV, latitude, level, longitude, time, utc_date
" + ], + "text/plain": [ + "Name : /\n", + "Type : zarr.hierarchy.Group\n", + "Read-only : True\n", + "Store type : zarr.storage.FSStore\n", + "No. members : 6\n", + "No. arrays : 6\n", + "No. groups : 0\n", + "Arrays : PV, latitude, level, longitude, time, utc_date" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "group = zarr.open_group(store, mode=\"r\")\n", + "group.info" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "3238d58d-1866-4467-ab35-18fd97e80b0b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
Name               : /PV
Type               : zarr.core.Array
Data type          : float32
Shape              : (24, 37, 721, 1440)
Chunk shape        : (1, 1, 721, 1440)
Order              : C
Read-only          : True
Compressor         : Zstd(level=1)
Store type         : zarr.storage.FSStore
No. bytes          : 3687828480 (3.4G)
No. bytes stored   : 1893510506 (1.8G)
Storage ratio      : 1.9
Chunks initialized : 888/888
" + ], + "text/plain": [ + "Name : /PV\n", + "Type : zarr.core.Array\n", + "Data type : float32\n", + "Shape : (24, 37, 721, 1440)\n", + "Chunk shape : (1, 1, 721, 1440)\n", + "Order : C\n", + "Read-only : True\n", + "Compressor : Zstd(level=1)\n", + "Store type : zarr.storage.FSStore\n", + "No. bytes : 3687828480 (3.4G)\n", + "No. bytes stored : 1893510506 (1.8G)\n", + "Storage ratio : 1.9\n", + "Chunks initialized : 888/888" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "group.PV.info" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eade4790-3056-4c6d-a81c-8f85837d349d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/icechunk-python/notebooks/performance/era5_xarray-zarr3.ipynb b/icechunk-python/notebooks/performance/era5_xarray-zarr3.ipynb new file mode 100644 index 00000000..c4269d7a --- /dev/null +++ b/icechunk-python/notebooks/performance/era5_xarray-zarr3.ipynb @@ -0,0 +1,867 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "40c929a3-87d4-4c0e-a97d-1300d8adcae0", + "metadata": {}, + "source": [ + "# Icechunk Performance - Zarr V3\n", + "\n", + "Using data from the [NCAR ERA5 AWS Public Dataset](https://nsf-ncar-era5.s3.amazonaws.com/index.html)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b2904d5f-090b-4344-a2f7-99096ba26d27", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "xarray: 0.9.7.dev3734+g26081d4f\n", + "dask: 2024.9.1+8.g70f56e28\n", + "zarr: 3.0.0b1.dev8+g9bbfd88\n" + ] + } + ], + "source": [ + "import xarray as xr\n", + "import zarr\n", + "import dask\n", + "import fsspec\n", + "from dask.diagnostics import ProgressBar\n", + "\n", + "print('xarray: ', xr.__version__)\n", + "print('dask: ', dask.__version__)\n", + "print('zarr: ', zarr.__version__)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "05661b12-9714-4a77-9f33-e351b229895f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "zarr.config.set(\n", + " {\n", + " 'threading.max_workers': 16,\n", + " 'async.concurrency': 128\n", + " }\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "081e1a71-873e-45c3-b77d-5b7aa1617286", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 277 ms, sys: 37.5 ms, total: 315 ms\n", + "Wall time: 2.33 s\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/srv/conda/envs/icechunk/lib/python3.12/site-packages/xarray/backends/api.py:357: UserWarning: The specified chunks separate the stored chunks along dimension \"time\" starting at index 1. This could degrade performance. 
Instead, consider rechunking after loading.\n", + " var_chunks = _get_chunk(var, chunks, chunkmanager)\n" + ] + } + ], + "source": [ + "url = \"https://nsf-ncar-era5.s3.amazonaws.com/e5.oper.an.pl/194106/e5.oper.an.pl.128_060_pv.ll025sc.1941060100_1941060123.nc\"\n", + "%time ds = xr.open_dataset(fsspec.open(url).open(), engine=\"h5netcdf\", chunks={\"time\": 1})\n", + "ds = ds.drop_encoding()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b3048527-c50f-451c-9500-cac6c22dd1bc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Size: 4GB\n", + "Dimensions: (time: 24, level: 37, latitude: 721, longitude: 1440)\n", + "Coordinates:\n", + " * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", + " * level (level) float64 296B 1.0 2.0 3.0 5.0 ... 925.0 950.0 975.0 1e+03\n", + " * longitude (longitude) float64 12kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8\n", + " * time (time) datetime64[ns] 192B 1941-06-01 ... 1941-06-01T23:00:00\n", + "Data variables:\n", + " PV (time, level, latitude, longitude) float32 4GB dask.array\n", + " utc_date (time) int32 96B dask.array\n", + "Attributes:\n", + " DATA_SOURCE: ECMWF: https://cds.climate.copernicus.eu, Copernicu...\n", + " NETCDF_CONVERSION: CISL RDA: Conversion from ECMWF GRIB 1 data to netC...\n", + " NETCDF_VERSION: 4.8.1\n", + " CONVERSION_PLATFORM: Linux r1i4n4 4.12.14-95.51-default #1 SMP Fri Apr 1...\n", + " CONVERSION_DATE: Wed May 10 06:33:49 MDT 2023\n", + " Conventions: CF-1.6\n", + " NETCDF_COMPRESSION: NCO: Precision-preserving compression to netCDF4/HD...\n", + " history: Wed May 10 06:34:19 2023: ncks -4 --ppc default=7 e...\n", + " NCO: netCDF Operators version 5.0.3 (Homepage = http://n...\n" + ] + } + ], + "source": [ + "print(ds)" + ] + }, + { + "cell_type": "markdown", + "id": "7f4a801c-b570-45e3-b37f-2e140a2fb273", + "metadata": {}, + "source": [ + "### Load Data from HDF5 File\n", + "\n", + "This illustrates how loading directly from HDF5 files on S3 can be slow, even with Dask." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "29e344c6-a25e-4342-979f-d2d2c7aed7a7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 62.20 ss\n" + ] + } + ], + "source": [ + "with ProgressBar():\n", + " dsl = ds.load()" + ] + }, + { + "cell_type": "markdown", + "id": "bdbd3f6c-e62c-4cfc-8cfb-b0fa22b6bddd", + "metadata": {}, + "source": [ + "### Write Zarr Store - No Dask" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "67c6389d-79a0-4992-b845-6a633cb4d86b", + "metadata": {}, + "outputs": [], + "source": [ + "encoding = {\n", + " \"PV\": {\n", + " \"codecs\": [zarr.codecs.BytesCodec(), zarr.codecs.ZstdCodec()],\n", + " \"chunks\": (1, 1, 721, 1440)\n", + " }\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ece4f559-6ed5-4027-bab4-6ee42babf103", + "metadata": {}, + "outputs": [], + "source": [ + "import s3fs\n", + "s3 = s3fs.S3FileSystem(use_listings_cache=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "657f10fe-b29e-4ef5-8953-ce11374ce818", + "metadata": {}, + "outputs": [], + "source": [ + "target_path = \"icechunk-test/ryan/zarr-v3/test-era5-v3-919\"\n", + "store = zarr.storage.RemoteStore(s3, mode=\"w\", path=target_path)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "b9a8c5ab-cc5a-4a05-b4ba-3b52be187e18", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 36.2 s, sys: 2.53 s, total: 38.7 s\n", + "Wall time: 15.8 s\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time dsl.to_zarr(store, consolidated=False, zarr_format=3, encoding=encoding, mode=\"w\")" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "3718012b-3157-47a1-8ac4-f72d27a2132f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 12.60 s\n" + ] + } + ], + "source": [ + "# with dask\n", + "dslc = dsl.chunk({\"time\": 1, \"level\": 1})\n", + "store_d = zarr.storage.RemoteStore(s3, mode=\"w\", path=target_url + \"-dask\")\n", + "with ProgressBar():\n", + " dslc.to_zarr(store_d, consolidated=False, zarr_format=3, encoding=encoding, mode=\"w\")" + ] + }, + { + "cell_type": "markdown", + "id": "34b1a12c-9640-4f8b-a5fc-2ade040b437c", + "metadata": {}, + "source": [ + "### Read Data Back" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a9c1bfc7-61d2-4a92-ab82-b026e7b9fcf6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 35.6 ms, sys: 0 ns, total: 35.6 ms\n", + "Wall time: 343 ms\n" + ] + } + ], + "source": [ + "#store = zarr.storage.RemoteStore(s3, mode=\"r\", path=target_url)\n", + "%time dss = xr.open_dataset(store, consolidated=False, zarr_format=3, engine=\"zarr\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "752983c4-7e76-4530-8b7b-73b6bb5e2600", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", +       "
<xarray.Dataset> Size: 4GB\n",
+       "Dimensions:    (time: 24, level: 37, latitude: 721, longitude: 1440)\n",
+       "Coordinates:\n",
+       "  * latitude   (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n",
+       "  * level      (level) float64 296B 1.0 2.0 3.0 5.0 ... 925.0 950.0 975.0 1e+03\n",
+       "  * longitude  (longitude) float64 12kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8\n",
+       "  * time       (time) datetime64[ns] 192B 1941-06-01 ... 1941-06-01T23:00:00\n",
+       "Data variables:\n",
+       "    PV         (time, level, latitude, longitude) float32 4GB ...\n",
+       "    utc_date   (time) int32 96B ...\n",
+       "Attributes:\n",
+       "    DATA_SOURCE:          ECMWF: https://cds.climate.copernicus.eu, Copernicu...\n",
+       "    NETCDF_CONVERSION:    CISL RDA: Conversion from ECMWF GRIB 1 data to netC...\n",
+       "    NETCDF_VERSION:       4.8.1\n",
+       "    CONVERSION_PLATFORM:  Linux r1i4n4 4.12.14-95.51-default #1 SMP Fri Apr 1...\n",
+       "    CONVERSION_DATE:      Wed May 10 06:33:49 MDT 2023\n",
+       "    Conventions:          CF-1.6\n",
+       "    NETCDF_COMPRESSION:   NCO: Precision-preserving compression to netCDF4/HD...\n",
+       "    history:              Wed May 10 06:34:19 2023: ncks -4 --ppc default=7 e...\n",
+       "    NCO:                  netCDF Operators version 5.0.3 (Homepage = http://n...
" + ], + "text/plain": [ + " Size: 4GB\n", + "Dimensions: (time: 24, level: 37, latitude: 721, longitude: 1440)\n", + "Coordinates:\n", + " * latitude (latitude) float64 6kB 90.0 89.75 89.5 ... -89.5 -89.75 -90.0\n", + " * level (level) float64 296B 1.0 2.0 3.0 5.0 ... 925.0 950.0 975.0 1e+03\n", + " * longitude (longitude) float64 12kB 0.0 0.25 0.5 0.75 ... 359.2 359.5 359.8\n", + " * time (time) datetime64[ns] 192B 1941-06-01 ... 1941-06-01T23:00:00\n", + "Data variables:\n", + " PV (time, level, latitude, longitude) float32 4GB ...\n", + " utc_date (time) int32 96B ...\n", + "Attributes:\n", + " DATA_SOURCE: ECMWF: https://cds.climate.copernicus.eu, Copernicu...\n", + " NETCDF_CONVERSION: CISL RDA: Conversion from ECMWF GRIB 1 data to netC...\n", + " NETCDF_VERSION: 4.8.1\n", + " CONVERSION_PLATFORM: Linux r1i4n4 4.12.14-95.51-default #1 SMP Fri Apr 1...\n", + " CONVERSION_DATE: Wed May 10 06:33:49 MDT 2023\n", + " Conventions: CF-1.6\n", + " NETCDF_COMPRESSION: NCO: Precision-preserving compression to netCDF4/HD...\n", + " history: Wed May 10 06:34:19 2023: ncks -4 --ppc default=7 e...\n", + " NCO: netCDF Operators version 5.0.3 (Homepage = http://n..." + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dss" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "feb23457-c6fe-4363-8393-c92ab1ae7a89", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 15.7 ms, sys: 0 ns, total: 15.7 ms\n", + "Wall time: 101 ms\n" + ] + }, + { + "data": { + "text/plain": [ + "array(0.00710905, dtype=float32)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%time dss.PV[0, 0, 0, 0].values" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "d5103624-554c-4d18-a323-d24f82b99818", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 8.41 s, sys: 1.19 s, total: 9.6 s\n", + "Wall time: 5.11 s\n" + ] + } + ], + "source": [ + "%time _ = dss.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d302b787-3279-4564-a29a-5be82c82dd5d", + "metadata": {}, + "outputs": [], + "source": [ + "dssd = xr.open_dataset(store, consolidated=False, engine=\"zarr\").chunk({\"time\": 1, \"level\": 10})" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "ab01b1f7-42ff-41cf-aac6-c2c93f968227", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[########################################] | 100% Completed | 6.26 sms\n" + ] + } + ], + "source": [ + "with ProgressBar():\n", + " _ = dssd.compute()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3dfd18bd-c885-4103-a156-ef9185d9d461", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:icechunk]", + "language": "python", + "name": "conda-env-icechunk-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/icechunk-python/notebooks/reference.ipynb b/icechunk-python/notebooks/reference.ipynb deleted file mode 100644 index 094e721f..00000000 --- 
a/icechunk-python/notebooks/reference.ipynb +++ /dev/null @@ -1,132 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import zarr" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Ok! Lets create some data!" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Group(_async_group=)" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "group = zarr.group(overwrite=True)\n", - "store = group.store_path.store\n", - "group" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "air_temp = group.create_array(\"air_temp\", shape=(1000, 1000), chunk_shape=(100, 100), dtype=\"i4\")\n", - "air_temp" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "zarr.json\n", - "air_temp/zarr.json\n" - ] - } - ], - "source": [ - "async for key in store.list():\n", - " print(key)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "air_temp[:, :] = 42" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array(42, dtype=int32)" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "air_temp[200, 6]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": ".venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/icechunk-python/notebooks/version-control.ipynb b/icechunk-python/notebooks/version-control.ipynb index 3cfe1169..d77f323c 100644 --- a/icechunk-python/notebooks/version-control.ipynb +++ b/icechunk-python/notebooks/version-control.ipynb @@ -10,15 +10,13 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "020e322c-4323-4064-b17e-a1e95f710d21", "metadata": {}, "outputs": [], "source": [ - "\n", "import zarr\n", - "\n", - "from icechunk import IcechunkStore, Storage" + "from icechunk import IcechunkStore, StorageConfig" ] }, { @@ -33,26 +31,24 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "dd35041c-7981-446a-8981-d1eae02f4fff", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "" + "" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "\n", - "store = await IcechunkStore.create(\n", - " storage=Storage.memory(\"test\"),\n", - " mode=\"w\",\n", + "store = IcechunkStore.create(\n", + " storage=StorageConfig.memory(\"test\")\n", ")\n", "store" ] @@ -90,7 +86,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "51654a0d-58b2-43a9-acd9-0214f22c3dc5", "metadata": {}, "outputs": [], @@ -100,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": 
"d8bf3160-2a39-48be-82ea-e800fd3164b3", "metadata": {}, "outputs": [], @@ -110,37 +106,29 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "id": "3a33f69c-9949-458a-9d3a-1f0d7f451553", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[icechunk/src/storage/caching.rs:190:9] \"inserting\" = \"inserting\"\n", - "[icechunk/src/storage/caching.rs:190:9] &id = 78447f5713395150be2281b3254cded2\n" - ] - }, { "data": { "text/plain": [ - "'AYHJX8N6C308R5J57CC8CX6N20'" + "'51MXCR5RTNGPC54Z7WJG'" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "first_commit = await store.commit(\"first commit\")\n", + "first_commit = store.commit(\"first commit\")\n", "first_commit" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "id": "f41f701e-a513-4fe6-b23e-82200f5ab221", "metadata": {}, "outputs": [ @@ -150,7 +138,7 @@ "{'attr': 'first_attr'}" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -161,32 +149,24 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "id": "10e40f91-7f90-4feb-91ba-b51b709d508d", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[icechunk/src/storage/caching.rs:190:9] \"inserting\" = \"inserting\"\n", - "[icechunk/src/storage/caching.rs:190:9] &id = 0e808d75aab5b5cf1bf33732244ef431\n" - ] - }, { "data": { "text/plain": [ - "'Y3YFWYFMW3RRMZP04M9STWKANR'" + "'45AE3AT46RHZCZ50HWEG'" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "root_group.attrs[\"attr\"] = \"second_attr\"\n", - "second_commit = await store.commit(\"second commit\")\n", + "second_commit = store.commit(\"second commit\")\n", "second_commit" ] }, @@ -200,17 +180,17 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "id": "34fff29b-2bec-490c-89ef-51e14fb4527f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'Y3YFWYFMW3RRMZP04M9STWKANR'" + "'45AE3AT46RHZCZ50HWEG'" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -237,17 +217,17 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "id": "ff66cc99-84ca-4371-b63d-12efa6e98dc3", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "('Y3YFWYFMW3RRMZP04M9STWKANR', {'attr': 'second_attr'})" + "('45AE3AT46RHZCZ50HWEG', {'attr': 'second_attr'})" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -258,7 +238,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "id": "e785d9a1-36ec-4207-b334-20e0a68e3ac8", "metadata": {}, "outputs": [ @@ -268,13 +248,13 @@ "{'attr': 'first_attr'}" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "await store.checkout(snapshot_id=first_commit)\n", + "store.checkout(snapshot_id=first_commit)\n", "root_group = zarr.group(store=store)\n", "dict(root_group.attrs)" ] @@ -291,7 +271,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "257215f2-fa09-4730-a1da-07a4d3d12b0c", "metadata": {}, "outputs": [ @@ -302,15 +282,15 @@ "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most 
recent call last)", - "Cell \u001b[0;32mIn[12], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m root_group\u001b[38;5;241m.\u001b[39mattrs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mattr\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwill_fail\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m store\u001b[38;5;241m.\u001b[39mcommit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthis should fail\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "File \u001b[0;32m~/repos/icechunk/icechunk-python/python/icechunk/__init__.py:60\u001b[0m, in \u001b[0;36mIcechunkStore.commit\u001b[0;34m(self, message)\u001b[0m\n\u001b[1;32m 59\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcommit\u001b[39m(\u001b[38;5;28mself\u001b[39m, message: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mstr\u001b[39m:\n\u001b[0;32m---> 60\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_store\u001b[38;5;241m.\u001b[39mcommit(message)\n", + "Cell \u001b[0;32mIn[11], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m root_group\u001b[38;5;241m.\u001b[39mattrs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mattr\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mwill_fail\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mawait\u001b[39;00m store\u001b[38;5;241m.\u001b[39mcommit(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthis should fail\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/Developer/icechunk/icechunk-python/python/icechunk/__init__.py:261\u001b[0m, in \u001b[0;36mIcechunkStore.commit\u001b[0;34m(self, message)\u001b[0m\n\u001b[1;32m 255\u001b[0m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mcommit\u001b[39m(\u001b[38;5;28mself\u001b[39m, message: \u001b[38;5;28mstr\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mstr\u001b[39m:\n\u001b[1;32m 256\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Commit any uncommitted changes to the store.\u001b[39;00m\n\u001b[1;32m 257\u001b[0m \n\u001b[1;32m 258\u001b[0m \u001b[38;5;124;03m This will create a new snapshot on the current branch and return\u001b[39;00m\n\u001b[1;32m 259\u001b[0m \u001b[38;5;124;03m the snapshot id.\u001b[39;00m\n\u001b[1;32m 260\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 261\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mawait\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_store\u001b[38;5;241m.\u001b[39mcommit(message)\n", "\u001b[0;31mValueError\u001b[0m: store error: all commits must be made on a branch" ] } ], "source": [ "root_group.attrs[\"attr\"] = \"will_fail\"\n", - "await store.commit(\"this should fail\")" + "store.commit(\"this should fail\")" ] }, { @@ -334,7 +314,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "id": "4ccb3c84-787a-43c4-b9af-606e6b8212ed", "metadata": {}, "outputs": [], @@ -354,17 +334,17 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "id": "81f676de-48f7-4dd1-bbf9-300f97700f32", "metadata": {}, "outputs": [], "source": [ - "await store.reset()" + "store.reset()" ] }, { "cell_type": "code", - "execution_count": 15, + 
"execution_count": 14, "id": "94b4e4d8-767a-45d0-9f4f-b0a473e9520a", "metadata": {}, "outputs": [], @@ -374,28 +354,28 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "id": "b35947c7-e634-41e7-a78e-89447a5f4f8e", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'AYHJX8N6C308R5J57CC8CX6N20'" + "'51MXCR5RTNGPC54Z7WJG'" ] }, - "execution_count": 16, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "await store.new_branch(\"new-branch\")" + "store.new_branch(\"new-branch\")" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "id": "121d2b55-9311-4f1a-813a-c6c49bbc4a4f", "metadata": {}, "outputs": [ @@ -405,7 +385,7 @@ "'new-branch'" ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -416,7 +396,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 17, "id": "799cfce7-7385-4ae6-8868-77d4789c5cdb", "metadata": {}, "outputs": [ @@ -426,7 +406,7 @@ "{'attr': 'first_attr'}" ] }, - "execution_count": 18, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -438,22 +418,13 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 18, "id": "c3484aba-d25b-4d26-aa59-714e1f236d24", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "[icechunk/src/storage/caching.rs:190:9] \"inserting\" = \"inserting\"\n", - "[icechunk/src/storage/caching.rs:190:9] &id = af76e505cac3a24c91e6ac817335ee40\n" - ] - } - ], + "outputs": [], "source": [ "root_group.attrs[\"attr\"] = \"new_branch_attr\"\n", - "new_branch_commit = await store.commit(\"commit on new branch\")" + "new_branch_commit = store.commit(\"commit on new branch\")" ] }, { @@ -468,17 +439,17 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 19, "id": "5ed7e6ed-47db-4542-b773-6ab128a10395", "metadata": {}, "outputs": [], "source": [ - "await store.checkout(branch=\"main\")" + "store.checkout(branch=\"main\")" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "id": "b495cfbf-3f82-4f8c-9943-119e6a69dafb", "metadata": {}, "outputs": [ @@ -488,7 +459,7 @@ "True" ] }, - "execution_count": 22, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -499,7 +470,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 21, "id": "c80da99a-a78f-419f-a92d-bcf770c0db53", "metadata": {}, "outputs": [ @@ -509,13 +480,13 @@ "True" ] }, - "execution_count": 24, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "await store.checkout(branch=\"new-branch\")\n", + "store.checkout(branch=\"new-branch\")\n", "store.snapshot_id == new_branch_commit" ] }, @@ -537,12 +508,12 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 22, "id": "47a4a3a2-0ae2-4e93-a1a6-a2d4706339db", "metadata": {}, "outputs": [], "source": [ - "await store.checkout(branch=\"main\")" + "store.checkout(branch=\"main\")" ] }, { @@ -555,22 +526,22 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 23, "id": "6366cb69-cb02-4b61-9607-dc4b0ba08517", "metadata": {}, "outputs": [], "source": [ - "await store.tag(\"v0\", snapshot_id=store.snapshot_id)" + "store.tag(\"v0\", snapshot_id=store.snapshot_id)" ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 24, "id": "81da441c-ccab-43c2-b50e-0588fb4c91bc", "metadata": {}, "outputs": [], 
"source": [ - "await store.tag(\"v-1\", snapshot_id=first_commit)" + "store.tag(\"v-1\", snapshot_id=first_commit)" ] }, { @@ -585,7 +556,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": 25, "id": "dc509bde-2510-48f1-90b0-69a065393ced", "metadata": {}, "outputs": [ @@ -595,24 +566,24 @@ "True" ] }, - "execution_count": 31, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "await store.checkout(tag=\"v-1\")\n", + "store.checkout(tag=\"v-1\")\n", "store.snapshot_id == first_commit" ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": 26, "id": "b8b221c1-d94d-4971-97b9-ffa1948ce93d", "metadata": {}, "outputs": [], "source": [ - "await store.checkout(branch=\"main\")" + "store.checkout(branch=\"main\")" ] } ], diff --git a/icechunk-python/pyproject.toml b/icechunk-python/pyproject.toml index 950d1504..efef2fa8 100644 --- a/icechunk-python/pyproject.toml +++ b/icechunk-python/pyproject.toml @@ -4,28 +4,40 @@ build-backend = "maturin" [project] name = "icechunk" -requires-python = ">=3.10" +requires-python = ">=3.11" classifiers = [ - "Programming Language :: Rust", - "Programming Language :: Python :: Implementation :: CPython", - "Programming Language :: Python :: Implementation :: PyPy", + "Programming Language :: Rust", + "Programming Language :: Python :: Implementation :: CPython", + "Programming Language :: Python :: Implementation :: PyPy", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ] +license = { text = "Apache-2.0" } dynamic = ["version"] -dependencies = [ - "zarr==3.0.0a5" -] +dependencies = ["zarr==3.0.0b1"] + +[tool.poetry] +name = "icechunk" +version = "0.1.0-alpha.3" +description = "Icechunk Python" +authors = ["Earthmover "] +readme = "README.md" +packages = [{ include = "icechunk", from = "python" }] [project.optional-dependencies] test = [ - "coverage", - "mypy", - "object-store-python", - "pytest", - "pytest-cov", - "pytest-asyncio", - "hypothesis", - "ruff", + "coverage", + "mypy", + "object-store-python", + "pytest", + "pytest-cov", + "pytest-asyncio", + "ruff", + "dask", + "distributed", + "hypothesis", ] [tool.maturin] @@ -35,7 +47,30 @@ python-source = "python" [tool.pytest.ini_options] asyncio_mode = "auto" +minversion = "7" +testpaths = ["tests"] +log_cli_level = "INFO" +xfail_strict = true +addopts = ["-ra", "--strict-config", "--strict-markers"] +filterwarnings = ["error"] [tool.pyright] venvPath = "." 
venv = ".venv" + +[tool.mypy] +python_version = "3.11" +strict = true +warn_unreachable = true +enable_error_code = ["ignore-without-code", "redundant-expr", "truthy-bool"] + +[tool.ruff] +line-length = 90 +exclude = ["*.ipynb"] + +[tool.ruff.lint] +extend-select = [ + "B", # flake8-bugbear + "I", # isort + "UP", # pypupgrade +] diff --git a/icechunk-python/python/icechunk/__init__.py b/icechunk-python/python/icechunk/__init__.py index a00643df..f88e89a4 100644 --- a/icechunk-python/python/icechunk/__init__.py +++ b/icechunk-python/python/icechunk/__init__.py @@ -1,24 +1,36 @@ # module -import json -from typing import Any, AsyncGenerator, Self +from collections.abc import AsyncGenerator, Iterable +from typing import Any, Self + +from zarr.abc.store import ByteRangeRequest, Store +from zarr.core.buffer import Buffer, BufferPrototype +from zarr.core.common import AccessModeLiteral, BytesLike +from zarr.core.sync import SyncMixin + from ._icechunk_python import ( + KeyNotFound, PyIcechunkStore, S3Credentials, - pyicechunk_store_create, - pyicechunk_store_from_json_config, SnapshotMetadata, - Storage, + StorageConfig, StoreConfig, - pyicechunk_store_open_existing, + VirtualRefConfig, + __version__, + pyicechunk_store_create, pyicechunk_store_exists, + pyicechunk_store_from_bytes, + pyicechunk_store_open_existing, ) -from zarr.abc.store import AccessMode, Store -from zarr.core.buffer import Buffer, BufferPrototype -from zarr.core.common import AccessModeLiteral, BytesLike -from zarr.core.sync import SyncMixin - -__all__ = ["IcechunkStore", "Storage", "S3Credentials", "StoreConfig"] +__all__ = [ + "__version__", + "IcechunkStore", + "StorageConfig", + "S3Credentials", + "SnapshotMetadata", + "StoreConfig", + "VirtualRefConfig", +] class IcechunkStore(Store, SyncMixin): @@ -26,16 +38,19 @@ class IcechunkStore(Store, SyncMixin): @classmethod async def open(cls, *args: Any, **kwargs: Any) -> Self: - """FIXME: Better handle the open method based on the access mode the user passed in along with the kwargs - https://github.com/zarr-developers/zarr-python/blob/c878da2a900fc621ff23cc6d84d45cd3cb26cbed/src/zarr/abc/store.py#L24-L30 + """This method is called by zarr-python, it's not intended for users. + + Use one of `IcechunkStore.open_existing`, `IcechunkStore.create` or `IcechunkStore.open_or_create` instead. """ + return cls.open_or_create(*args, **kwargs) + + @classmethod + def open_or_create(cls, *args: Any, **kwargs: Any) -> Self: if "mode" in kwargs: mode = kwargs.pop("mode") else: mode = "r" - access_mode = AccessMode.from_literal(mode) - if "storage" in kwargs: storage = kwargs.pop("storage") else: @@ -43,28 +58,35 @@ async def open(cls, *args: Any, **kwargs: Any) -> Self: "Storage configuration is required. Pass a Storage object to construct an IcechunkStore" ) - store_exists = await pyicechunk_store_exists(storage) - - if access_mode.overwrite: - if store_exists: - raise ValueError( - "Store already exists and overwrite is not allowed for IcechunkStore" - ) - store = await cls.create(storage, mode, *args, **kwargs) - elif access_mode.create or access_mode.update: - if store_exists: - store = await cls.open_existing(storage, mode, *args, **kwargs) - else: - store = await cls.create(storage, mode, *args, **kwargs) - else: - store = await cls.open_existing(storage, mode, *args, **kwargs) - - # We dont want to call _open() becuase icechunk handles the opening, etc. 
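The rewritten `open_or_create` below dispatches on the zarr-style access mode: "r" and "r+" open an existing repository, "a" opens or creates, "w" clears an existing repository, and "w-" refuses to overwrite one. A minimal usage sketch, assuming the in-memory storage backend:

```python
# Minimal sketch of the mode semantics implemented by the match statement below.
from icechunk import IcechunkStore, StorageConfig

storage = StorageConfig.memory("example")
# "a": open the repository if it already exists, otherwise create it
store = IcechunkStore.open_or_create(storage=storage, mode="a")
```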
+ store = None + match mode: + case "r" | "r+": + store = cls.open_existing(storage, mode, *args, **kwargs) + case "a": + if pyicechunk_store_exists(storage): + store = cls.open_existing(storage, mode, *args, **kwargs) + else: + store = cls.create(storage, mode, *args, **kwargs) + case "w": + if pyicechunk_store_exists(storage): + store = cls.open_existing(storage, mode, *args, **kwargs) + store.sync_clear() + else: + store = cls.create(storage, mode, *args, **kwargs) + case "w-": + if pyicechunk_store_exists(storage): + raise ValueError("""Zarr store already exists, open using mode "w" or "r+""""") + else: + store = cls.create(storage, mode, *args, **kwargs) + + assert(store) + # We dont want to call _open() because icechunk handles the opening, etc. # if we have gotten this far we can mark it as open store._is_open = True return store + def __init__( self, store: PyIcechunkStore, @@ -72,8 +94,11 @@ def __init__( *args: Any, **kwargs: Any, ): - """Create a new IcechunkStore. This should not be called directly, instead use the create or open_existing class methods.""" - super().__init__(mode, *args, **kwargs) + """Create a new IcechunkStore. + + This should not be called directly, instead use the `create`, `open_existing` or `open_or_create` class methods. + """ + super().__init__(*args, mode=mode, **kwargs) if store is None: raise ValueError( "An IcechunkStore should not be created with the default constructor, instead use either the create or open_existing class methods." @@ -81,67 +106,11 @@ def __init__( self._store = store @classmethod - async def from_config( - cls, config: dict, mode: AccessModeLiteral = "r", *args: Any, **kwargs: Any - ) -> Self: - """Create an IcechunkStore from a given configuration. - - NOTE: This is deprecated and will be removed in a future release. Use the open_existing or create methods instead. - - The configuration should be a dictionary in the following format: - { - "storage": { - "type": "s3, // one of "in_memory", "local_filesystem", "s3", "cached" - "...": "additional storage configuration" - }, - "repository": { - // Optional, only required if you want to open an existing repository - "version": { - "branch": "main", - }, - // The threshold at which chunks are stored inline and not written to chunk storage - inline_chunk_threshold_bytes: 512, - }, - } - - The following storage types are supported: - - in_memory: store data in memory - - local_filesystem: store data on the local filesystem - - s3: store data on S3 compatible storage - - cached: store data in memory with a backing storage - - The following additional configuration options are supported for each storage type: - - in_memory: {} - - local_filesystem: {"root": "path/to/root/directory"} - - s3: { - "bucket": "bucket-name", - "prefix": "optional-prefix", - "endpoint": "optional-end - "access_key_id": "optional-access-key-id", - "secret_access_key": "optional", - "session_token": "optional", - "endpoint": "optional" - } - - cached: { - "approx_max_memory_bytes": 1_000_000, - "backend": { - "type": "s3", - "...": "additional storage configuration" - } - } - - If opened with AccessModeLiteral "r", the store will be read-only. Otherwise the store will be writable. 
- """ - config_str = json.dumps(config) - read_only = mode == "r" - store = await pyicechunk_store_from_json_config(config_str, read_only=read_only) - return cls(store=store, mode=mode, args=args, kwargs=kwargs) - - @classmethod - async def open_existing( + def open_existing( cls, - storage: Storage, + storage: StorageConfig, mode: AccessModeLiteral = "r", + config: StoreConfig | None = None, *args: Any, **kwargs: Any, ) -> Self: @@ -155,105 +124,329 @@ async def open_existing( If opened with AccessModeLiteral "r", the store will be read-only. Otherwise the store will be writable. """ + config = config or StoreConfig() read_only = mode == "r" - store = await pyicechunk_store_open_existing(storage, read_only=read_only) + # We have delayed checking if the repository exists, to avoid the delay in the happy case + # So we need to check now if open fails, to provide a nice error message + try: + store = pyicechunk_store_open_existing( + storage, read_only=read_only, config=config + ) + # TODO: we should have an exception type to catch here, for the case of non-existing repo + except Exception as e: + if pyicechunk_store_exists(storage): + # if the repo exists, this is an actual error we need to raise + raise e + else: + # if the repo doesn't exists, we want to point users to that issue instead + raise ValueError("No Icechunk repository at the provided location, try opening in create mode or changing the location") from None return cls(store=store, mode=mode, args=args, kwargs=kwargs) @classmethod - async def create( + def create( cls, - storage: Storage, + storage: StorageConfig, mode: AccessModeLiteral = "w", + config: StoreConfig | None = None, *args: Any, **kwargs: Any, ) -> Self: """Create a new IcechunkStore with the given storage configuration. If a store already exists at the given location, an error will be raised. - - It is recommended to use the cached storage option for better performance. If cached=True, - this will be configured automatically with the provided storage_config as the underlying - storage backend. """ - store = await pyicechunk_store_create(storage) + config = config or StoreConfig() + store = pyicechunk_store_create(storage, config=config) return cls(store=store, mode=mode, args=args, kwargs=kwargs) + def with_mode(self, mode: AccessModeLiteral) -> Self: + """ + Return a new store of the same type pointing to the same location with a new mode. + + The returned Store is not automatically opened. Call :meth:`Store.open` before + using. + + Parameters + ---------- + mode: AccessModeLiteral + The new mode to use. + + Returns + ------- + store: + A new store of the same type with the new mode. 
+ + """ + read_only = mode == "r" + new_store = self._store.with_mode(read_only) + return self.__class__(new_store, mode=mode) + + def __eq__(self, value: object) -> bool: + if not isinstance(value, self.__class__): + return False + return self._store == value._store + + def __getstate__(self) -> object: + # we serialize the Rust store as bytes + d = self.__dict__.copy() + d["_store"] = self._store.as_bytes() + return d + + def __setstate__(self, state: Any) -> None: + # we have to deserialize the bytes of the Rust store + mode = state["_mode"] + is_read_only = mode.readonly + store_repr = state["_store"] + state["_store"] = pyicechunk_store_from_bytes(store_repr, is_read_only) + self.__dict__ = state + @property def snapshot_id(self) -> str: """Return the current snapshot id.""" return self._store.snapshot_id + def change_set_bytes(self) -> bytes: + """Get the complete list of changes applied in this session, serialized to bytes. + + This method is useful in combination with `IcechunkStore.distributed_commit`. When a + write session is too large to execute in a single machine, it could be useful to + distribute it across multiple workers. Each worker can write their changes independently + (map) and then a single commit is executed by a coordinator (reduce). + + This methods provides a way to send back to gather a "description" of the + changes applied by a worker. Resulting bytes, together with the `change_set_bytes` of + other workers, can be fed to `distributed_commit`. + + This API is subject to change, it will be replaced by a merge operation at the Store level. + """ + return self._store.change_set_bytes() + @property def branch(self) -> str | None: """Return the current branch name.""" return self._store.branch - async def checkout( + def checkout( self, snapshot_id: str | None = None, branch: str | None = None, tag: str | None = None, ) -> None: - """Checkout a branch, tag, or specific snapshot.""" + """Checkout a branch, tag, or specific snapshot. + + If a branch is checked out, any following `commit` attempts will update that branch + reference if successful. If a tag or snapshot_id are checked out, the repository + won't allow commits. + """ if snapshot_id is not None: if branch is not None or tag is not None: raise ValueError( "only one of snapshot_id, branch, or tag may be specified" ) - return await self._store.checkout_snapshot(snapshot_id) + return self._store.checkout_snapshot(snapshot_id) if branch is not None: if tag is not None: raise ValueError( "only one of snapshot_id, branch, or tag may be specified" ) - return await self._store.checkout_branch(branch) + return self._store.checkout_branch(branch) if tag is not None: - return await self._store.checkout_tag(tag) + return self._store.checkout_tag(tag) raise ValueError("a snapshot_id, branch, or tag must be specified") - async def commit(self, message: str) -> str: + async def async_checkout( + self, + snapshot_id: str | None = None, + branch: str | None = None, + tag: str | None = None, + ) -> None: + """Checkout a branch, tag, or specific snapshot. + + If a branch is checked out, any following `commit` attempts will update that branch + reference if successful. If a tag or snapshot_id are checked out, the repository + won't allow commits. 
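The checkout docstring above pins down when commits are allowed; a short sketch of that behavior, following the flow of the version-control notebook (the "checkout-demo" prefix and commit messages are illustrative):

```python
# Sketch: commits only succeed while a branch is checked out.
from icechunk import IcechunkStore, StorageConfig

store = IcechunkStore.create(StorageConfig.memory("checkout-demo"))
# ... write something through zarr, then:
first = store.commit("first commit")   # allowed: the "main" branch is checked out
store.tag("v0", snapshot_id=first)

store.checkout(tag="v0")               # detached, read-only view of that snapshot
# store.commit("...") would now raise: "all commits must be made on a branch"
store.checkout(branch="main")          # back on a branch; commits work again

for snapshot in store.ancestry():      # walk the history of the current version
    print(snapshot.id, snapshot.written_at, snapshot.message)
```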
+ """ + if snapshot_id is not None: + if branch is not None or tag is not None: + raise ValueError( + "only one of snapshot_id, branch, or tag may be specified" + ) + return await self._store.async_checkout_snapshot(snapshot_id) + if branch is not None: + if tag is not None: + raise ValueError( + "only one of snapshot_id, branch, or tag may be specified" + ) + return await self._store.async_checkout_branch(branch) + if tag is not None: + return await self._store.async_checkout_tag(tag) + + raise ValueError("a snapshot_id, branch, or tag must be specified") + + def commit(self, message: str) -> str: + """Commit any uncommitted changes to the store. + + This will create a new snapshot on the current branch and return + the new snapshot id. + + This method will fail if: + + * there is no currently checked out branch + * some other writer updated the current branch since the repository was checked out + """ + return self._store.commit(message) + + async def async_commit(self, message: str) -> str: """Commit any uncommitted changes to the store. This will create a new snapshot on the current branch and return - the snapshot id. + the new snapshot id. + + This method will fail if: + + * there is no currently checked out branch + * some other writer updated the current branch since the repository was checked out + """ + return await self._store.async_commit(message) + + def distributed_commit( + self, message: str, other_change_set_bytes: list[bytes] + ) -> str: + """Commit any uncommitted changes to the store with a set of distributed changes. + + This will create a new snapshot on the current branch and return + the new snapshot id. + + This method will fail if: + + * there is no currently checked out branch + * some other writer updated the current branch since the repository was checked out + + other_change_set_bytes must be generated as the output of calling `change_set_bytes` + on other stores. The resulting commit will include changes from all stores. + + The behavior is undefined if the stores applied conflicting changes. + """ + return self._store.distributed_commit(message, other_change_set_bytes) + + async def async_distributed_commit( + self, message: str, other_change_set_bytes: list[bytes] + ) -> str: + """Commit any uncommitted changes to the store with a set of distributed changes. + + This will create a new snapshot on the current branch and return + the new snapshot id. + + This method will fail if: + + * there is no currently checked out branch + * some other writer updated the current branch since the repository was checked out + + other_change_set_bytes must be generated as the output of calling `change_set_bytes` + on other stores. The resulting commit will include changes from all stores. + + The behavior is undefined if the stores applied conflicting changes. 
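A sketch of the map/reduce pattern that `change_set_bytes` and `distributed_commit` enable; `worker` and `coordinator` are illustrative helpers, not part of the icechunk API:

```python
from icechunk import IcechunkStore

def worker(store: IcechunkStore) -> bytes:
    # each worker writes its own chunks through zarr, then serializes its changes
    return store.change_set_bytes()

def coordinator(store: IcechunkStore, worker_changes: list[bytes]) -> str:
    # a single commit gathers every worker's change set into one snapshot
    return store.distributed_commit("combined write session", worker_changes)
```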
""" - return await self._store.commit(message) + return await self._store.async_distributed_commit(message, other_change_set_bytes) @property def has_uncommitted_changes(self) -> bool: """Return True if there are uncommitted changes to the store""" return self._store.has_uncommitted_changes - async def reset(self) -> None: + async def async_reset(self) -> None: + """Discard any uncommitted changes and reset to the previous snapshot state.""" + return await self._store.async_reset() + + def reset(self) -> None: """Discard any uncommitted changes and reset to the previous snapshot state.""" - return await self._store.reset() + return self._store.reset() + + async def async_new_branch(self, branch_name: str) -> str: + """Create a new branch pointing to the current checked out snapshot. + + This requires having no uncommitted changes. + """ + return await self._store.async_new_branch(branch_name) + + def new_branch(self, branch_name: str) -> str: + """Create a new branch pointing to the current checked out snapshot. + + This requires having no uncommitted changes. + """ + return self._store.new_branch(branch_name) + + async def async_reset_branch(self, to_snapshot: str) -> None: + """Reset the currently checked out branch to point to a different snapshot. + + This requires having no uncommitted changes. + + The snapshot id can be obtained as the result of a commit operation, but, more probably, + as the id of one of the SnapshotMetadata objects returned by `ancestry()` + + This operation edits the repository history; it must be executed carefully. + In particular, the current snapshot may end up being inaccessible from any + other branches or tags. + """ + return await self._store.async_reset_branch(to_snapshot) + + def reset_branch(self, to_snapshot: str) -> None: + """Reset the currently checked out branch to point to a different snapshot. - async def new_branch(self, branch_name: str) -> str: - """Create a new branch from the current snapshot. This requires having no uncommitted changes.""" - return await self._store.new_branch(branch_name) + This requires having no uncommitted changes. - async def tag(self, tag_name: str, snapshot_id: str) -> None: - """Tag an existing snapshot with a given name.""" - return await self._store.tag(tag_name, snapshot_id=snapshot_id) + The snapshot id can be obtained as the result of a commit operation, but, more probably, + as the id of one of the SnapshotMetadata objects returned by `ancestry()` - def ancestry(self) -> AsyncGenerator[SnapshotMetadata, None]: + This operation edits the repository history, it must be executed carefully. + In particular, the current snapshot may end up being inaccessible from any + other branches or tags. + """ + return self._store.reset_branch(to_snapshot) + + def tag(self, tag_name: str, snapshot_id: str) -> None: + """Create a tag pointing to the current checked out snapshot.""" + return self._store.tag(tag_name, snapshot_id=snapshot_id) + + async def async_tag(self, tag_name: str, snapshot_id: str) -> None: + """Create a tag pointing to the current checked out snapshot.""" + return await self._store.async_tag(tag_name, snapshot_id=snapshot_id) + + def ancestry(self) -> list[SnapshotMetadata]: + """Get the list of parents of the current version. + """ + return self._store.ancestry() + + def async_ancestry(self) -> AsyncGenerator[SnapshotMetadata, None]: """Get the list of parents of the current version. 
Returns ------- AsyncGenerator[SnapshotMetadata, None] """ - return self._store.ancestry() + return self._store.async_ancestry() async def empty(self) -> bool: """Check if the store is empty.""" return await self._store.empty() async def clear(self) -> None: - """Clear the store.""" + """Clear the store. + + This will remove all contents from the current session, + including all groups and all arrays. But it will not modify the repository history. + """ return await self._store.clear() + def sync_clear(self) -> None: + """Clear the store. + + This will remove all contents from the current session, + including all groups and all arrays. But it will not modify the repository history. + """ + return self._store.sync_clear() + async def get( self, key: str, @@ -271,11 +464,10 @@ async def get( ------- Buffer """ + try: result = await self._store.get(key, byte_range) - if result is None: - return None - except ValueError as _e: + except KeyNotFound as _e: # Zarr python expects None to be returned if the key does not exist # but an IcechunkStore returns an error if the key does not exist return None @@ -285,23 +477,23 @@ async def get( async def get_partial_values( self, prototype: BufferPrototype, - key_ranges: list[tuple[str, tuple[int | None, int | None]]], + key_ranges: Iterable[tuple[str, ByteRangeRequest]], ) -> list[Buffer | None]: """Retrieve possibly partial values from given key_ranges. Parameters ---------- - key_ranges : list[tuple[str, tuple[int, int]]] + key_ranges : Iterable[tuple[str, tuple[int | None, int | None]]] Ordered set of key, range pairs, a key may occur multiple times with different ranges Returns ------- list of values, in the order of the key_ranges, may contain null/none for missing keys """ - result = await self._store.get_partial_values(key_ranges) - return [ - prototype.buffer.from_bytes(r) if r is not None else None for r in result - ] + # NOTE: pyo3 has not implicit conversion from an Iterable to a rust iterable. So we convert it + # to a list here first. Possible opportunity for optimization. + result = await self._store.get_partial_values(list(key_ranges)) + return [prototype.buffer.from_bytes(r) for r in result] async def exists(self, key: str) -> bool: """Check if a key exists in the store. @@ -331,7 +523,18 @@ async def set(self, key: str, value: Buffer) -> None: """ return await self._store.set(key, value.to_bytes()) - async def set_virtual_ref( + async def set_if_not_exists(self, key: str, value: Buffer) -> None: + """ + Store a key to ``value`` if the key is not already present. + + Parameters + ----------- + key : str + value : Buffer + """ + return await self._store.set_if_not_exists(key, value.to_bytes()) + + async def async_set_virtual_ref( self, key: str, location: str, *, offset: int, length: int ) -> None: """Store a virtual reference to a chunk. @@ -347,7 +550,25 @@ async def set_virtual_ref( length : int The length of the chunk in bytes, measured from the given offset """ - return await self._store.set_virtual_ref(key, location, offset, length) + return await self._store.async_set_virtual_ref(key, location, offset, length) + + def set_virtual_ref( + self, key: str, location: str, *, offset: int, length: int + ) -> None: + """Store a virtual reference to a chunk. + + Parameters + ---------- + key : str + The chunk to store the reference under. This is the fully qualified zarr key eg: 'array/c/0/0/0' + location : str + The location of the chunk in storage. 
This is absolute path to the chunk in storage eg: 's3://bucket/path/to/file.nc' + offset : int + The offset in bytes from the start of the file location in storage the chunk starts at + length : int + The length of the chunk in bytes, measured from the given offset + """ + return self._store.set_virtual_ref(key, location, offset, length) async def delete(self, key: str) -> None: """Remove a key from the store @@ -364,7 +585,7 @@ def supports_partial_writes(self) -> bool: return self._store.supports_partial_writes async def set_partial_values( - self, key_start_values: list[tuple[str, int, BytesLike]] + self, key_start_values: Iterable[tuple[str, int, BytesLike]] ) -> None: """Store values at a given key, starting at byte range_start. @@ -375,7 +596,9 @@ async def set_partial_values( range_starts, range_starts (considering the length of the respective values) must not specify overlapping ranges for the same key """ - return await self._store.set_partial_values(key_start_values) + # NOTE: pyo3 does not implicit conversion from an Iterable to a rust iterable. So we convert it + # to a list here first. Possible opportunity for optimization. + return await self._store.set_partial_values(list(key_start_values)) @property def supports_listing(self) -> bool: @@ -431,8 +654,3 @@ def list_dir(self, prefix: str) -> AsyncGenerator[str, None]: # listing methods should not be async, so we need to # wrap the async method in a sync method. return self._store.list_dir(prefix) - - def __eq__(self, other) -> bool: - if other is self: - return True - raise NotImplementedError diff --git a/icechunk-python/python/icechunk/_icechunk_python.pyi b/icechunk-python/python/icechunk/_icechunk_python.pyi index 105245a7..bb2c424d 100644 --- a/icechunk-python/python/icechunk/_icechunk_python.pyi +++ b/icechunk-python/python/icechunk/_icechunk_python.pyi @@ -1,24 +1,45 @@ import abc import datetime -from typing import AsyncGenerator +from collections.abc import AsyncGenerator +from typing import Any class PyIcechunkStore: + def as_bytes(self) -> bytes: ... + def with_mode(self, read_only: bool) -> PyIcechunkStore: ... @property def snapshot_id(self) -> str: ... + def change_set_bytes(self) -> bytes: ... @property def branch(self) -> str | None: ... - async def checkout_snapshot(self, snapshot_id: str) -> None: ... - async def checkout_branch(self, branch: str) -> None: ... - async def checkout_tag(self, tag: str) -> None: ... - async def commit(self, message: str) -> str: ... + def checkout_snapshot(self, snapshot_id: str) -> None: ... + async def async_checkout_snapshot(self, snapshot_id: str) -> None: ... + def checkout_branch(self, branch: str) -> None: ... + async def async_checkout_branch(self, branch: str) -> None: ... + def checkout_tag(self, tag: str) -> None: ... + async def async_checkout_tag(self, tag: str) -> None: ... + def distributed_commit( + self, message: str, other_change_set_bytes: list[bytes] + ) -> str: ... + async def async_distributed_commit( + self, message: str, other_change_set_bytes: list[bytes] + ) -> str: ... + def commit(self, message: str) -> str: ... + async def async_commit(self, message: str) -> str: ... @property def has_uncommitted_changes(self) -> bool: ... - async def reset(self) -> None: ... - async def new_branch(self, branch_name: str) -> str: ... - async def tag(self, tag: str, snapshot_id: str) -> None: ... - def ancestry(self) -> PyAsyncSnapshotGenerator: ... + def reset(self) -> None: ... + async def async_reset(self) -> None: ... 
+ def new_branch(self, branch_name: str) -> str: ... + async def async_new_branch(self, branch_name: str) -> str: ... + def reset_branch(self, snapshot_id: str) -> None: ... + async def async_reset_branch(self, snapshot_id: str) -> None: ... + def tag(self, tag: str, snapshot_id: str) -> None: ... + async def async_tag(self, tag: str, snapshot_id: str) -> None: ... + def ancestry(self) -> list[SnapshotMetadata]: ... + def async_ancestry(self) -> PyAsyncSnapshotGenerator: ... async def empty(self) -> bool: ... async def clear(self) -> None: ... + def sync_clear(self) -> None: ... async def get( self, key: str, byte_range: tuple[int | None, int | None] | None = None ) -> bytes: ... @@ -31,7 +52,13 @@ class PyIcechunkStore: @property def supports_deletes(self) -> bool: ... async def set(self, key: str, value: bytes) -> None: ... - async def set_virtual_ref(self, key: str, location: str, offset: int, length: int) -> None: ... + async def set_if_not_exists(self, key: str, value: bytes) -> None: ... + def set_virtual_ref( + self, key: str, location: str, offset: int, length: int + ) -> None: ... + async def async_set_virtual_ref( + self, key: str, location: str, offset: int, length: int + ) -> None: ... async def delete(self, key: str) -> None: ... @property def supports_partial_writes(self) -> bool: ... @@ -43,96 +70,253 @@ class PyIcechunkStore: def list(self) -> PyAsyncStringGenerator: ... def list_prefix(self, prefix: str) -> PyAsyncStringGenerator: ... def list_dir(self, prefix: str) -> PyAsyncStringGenerator: ... - def __eq__(self, other) -> bool: ... - + def __eq__(self, other: Any) -> bool: ... class PyAsyncStringGenerator(AsyncGenerator[str, None], metaclass=abc.ABCMeta): def __aiter__(self) -> PyAsyncStringGenerator: ... async def __anext__(self) -> str: ... - class SnapshotMetadata: @property def id(self) -> str: ... - @property def written_at(self) -> datetime.datetime: ... - @property def message(self) -> str: ... - -class PyAsyncSnapshotGenerator(AsyncGenerator[SnapshotMetadata, None], metaclass=abc.ABCMeta): +class PyAsyncSnapshotGenerator( + AsyncGenerator[SnapshotMetadata, None], metaclass=abc.ABCMeta +): def __aiter__(self) -> PyAsyncSnapshotGenerator: ... async def __anext__(self) -> SnapshotMetadata: ... - -class Storage: +class StorageConfig: """Storage configuration for an IcechunkStore Currently supports memory, filesystem, and S3 storage backends. - Use the class methods to create a Storage object with the desired backend. + Use the class methods to create a StorageConfig object with the desired backend. Ex: ``` - storage = Storage.memory("prefix") - storage = Storage.filesystem("/path/to/root") - storage = Storage.s3_from_env("bucket", "prefix") - storage = Storage.s3_from_credentials("bucket", "prefix", + storage_config = StorageConfig.memory("prefix") + storage_config = StorageConfig.filesystem("/path/to/root") + storage_config = StorageConfig.s3_from_env("bucket", "prefix") + storage_config = StorageConfig.s3_from_config("bucket", "prefix", ...) 
``` """ class Memory: - """An in-memory storage backend""" + """Config for an in-memory storage backend""" + prefix: str class Filesystem: - """A local filesystem storage backend""" + """Config for a local filesystem storage backend""" + root: str class S3: - """An S3 Object Storage compatible storage backend""" + """Config for an S3 Object Storage compatible storage backend""" + bucket: str prefix: str - credentials: S3Credentials + credentials: S3Credentials | None endpoint_url: str | None + allow_http: bool | None + region: str | None def __init__(self, storage: Memory | Filesystem | S3): ... - @classmethod - def memory(cls, prefix: str) -> Storage: ... + def memory(cls, prefix: str) -> StorageConfig: + """Create a StorageConfig object for an in-memory storage backend with the given prefix""" + ... @classmethod - def filesystem(cls, root: str) -> Storage: ... + def filesystem(cls, root: str) -> StorageConfig: + """Create a StorageConfig object for a local filesystem storage backend with the given root directory""" + ... @classmethod - def s3_from_env(cls, bucket: str, prefix: str, endpoint_url: str | None = None) -> Storage: ... + def s3_from_env(cls, bucket: str, prefix: str) -> StorageConfig: + """Create a StorageConfig object for an S3 Object Storage compatible storage backend + with the given bucket and prefix + + This assumes that the necessary credentials are available in the environment: + AWS_REGION + AWS_ACCESS_KEY_ID, + AWS_SECRET_ACCESS_KEY, + AWS_SESSION_TOKEN (optional) + AWS_ENDPOINT_URL (optional) + AWS_ALLOW_HTTP (optional) + """ + ... @classmethod - def s3_from_credentials(cls, bucket: str, prefix: str, credentials: S3Credentials, endpoint_url: str | None) -> Storage: ... + def s3_from_config( + cls, + bucket: str, + prefix: str, + credentials: S3Credentials, + endpoint_url: str | None, + allow_http: bool | None = None, + region: str | None = None, + ) -> StorageConfig: + """Create a StorageConfig object for an S3 Object Storage compatible storage + backend with the given bucket, prefix, and configuration + This method will directly use the provided credentials to authenticate with the S3 service, + ignoring any environment variables. + """ + ... + + @classmethod + def s3_anonymous( + cls, + bucket: str, + prefix: str, + endpoint_url: str | None, + allow_http: bool | None = None, + region: str | None = None, + ) -> StorageConfig: + """Create a StorageConfig object for an S3 Object Storage compatible storage + using anonymous access + """ + ... class S3Credentials: access_key_id: str secret_access_key: str session_token: str | None - def __init__(self, access_key_id: str, secret_access_key: str, session_token: str | None = None): ... + def __init__( + self, + access_key_id: str, + secret_access_key: str, + session_token: str | None = None, + ): ... +class VirtualRefConfig: + class S3: + """Config for an S3 Object Storage compatible storage backend""" + + credentials: S3Credentials | None + endpoint_url: str | None + allow_http: bool | None + region: str | None + + @classmethod + def s3_from_env(cls) -> VirtualRefConfig: + """Create a VirtualReferenceConfig object for an S3 Object Storage compatible storage backend + with the given bucket and prefix + + This assumes that the necessary credentials are available in the environment: + AWS_REGION or AWS_DEFAULT_REGION + AWS_ACCESS_KEY_ID, + AWS_SECRET_ACCESS_KEY, + AWS_SESSION_TOKEN (optional) + AWS_ENDPOINT_URL (optional) + AWS_ALLOW_HTTP (optional) + """ + ... 
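Putting `VirtualRefConfig` together with the store-level `set_virtual_ref`; the chunk key, object location, offset, and length below are illustrative values only:

```python
# Sketch: resolve virtual chunks with credentials from the environment and
# register one virtual reference; the actual bytes stay in the source object.
from icechunk import IcechunkStore, StorageConfig, StoreConfig, VirtualRefConfig

config = StoreConfig(virtual_ref_config=VirtualRefConfig.s3_from_env())
store = IcechunkStore.create(StorageConfig.memory("virtual-demo"), config=config)
store.set_virtual_ref(
    "array/c/0/0/0",                # fully qualified zarr chunk key
    "s3://bucket/path/to/file.nc",  # illustrative location of the real bytes
    offset=0,
    length=1024,
)
```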
+ + @classmethod + def s3_from_config( + cls, + credentials: S3Credentials, + *, + endpoint_url: str | None = None, + allow_http: bool | None = None, + region: str | None = None, + ) -> VirtualRefConfig: + """Create a VirtualReferenceConfig object for an S3 Object Storage compatible storage + backend with the given bucket, prefix, and configuration + + This method will directly use the provided credentials to authenticate with the S3 service, + ignoring any environment variables. + """ + ... + + @classmethod + def s3_anonymous( + cls, + *, + endpoint_url: str | None = None, + allow_http: bool | None = None, + region: str | None = None, + ) -> VirtualRefConfig: + """Create a VirtualReferenceConfig object for an S3 Object Storage compatible storage + using anonymous access + """ + ... + +class KeyNotFound(Exception): + def __init__( + self, + info: Any + ): ... class StoreConfig: + """Configuration for an IcechunkStore""" + + # The number of concurrent requests to make when fetching partial values get_partial_values_concurrency: int | None - inline_chunk_threshold: int | None + # The threshold at which to inline chunks in the store in bytes. When set, + # chunks smaller than this threshold will be inlined in the store. Default is + # 512 bytes. + inline_chunk_threshold_bytes: int | None + # Whether to allow overwriting refs in the store. Default is False. Experimental. unsafe_overwrite_refs: bool | None + # Configurations for virtual references such as credentials and endpoints + virtual_ref_config: VirtualRefConfig | None def __init__( self, get_partial_values_concurrency: int | None = None, - inline_chunk_threshold: int | None = None, + inline_chunk_threshold_bytes: int | None = None, unsafe_overwrite_refs: bool | None = None, - ): ... + virtual_ref_config: VirtualRefConfig | None = None, + ): + """Create a StoreConfig object with the given configuration options + + Parameters + ---------- + get_partial_values_concurrency: int | None + The number of concurrent requests to make when fetching partial values + inline_chunk_threshold_bytes: int | None + The threshold at which to inline chunks in the store in bytes. When set, + chunks smaller than this threshold will be inlined in the store. Default is + 512 bytes when not specified. + unsafe_overwrite_refs: bool | None + Whether to allow overwriting refs in the store. Default is False. Experimental. + virtual_ref_config: VirtualRefConfig | None + Configurations for virtual references such as credentials and endpoints + + Returns + ------- + StoreConfig + A StoreConfig object with the given configuration options + """ + ... + +async def async_pyicechunk_store_exists(storage: StorageConfig) -> bool: ... +def pyicechunk_store_exists(storage: StorageConfig) -> bool: ... + +async def async_pyicechunk_store_create( + storage: StorageConfig, config: StoreConfig | None +) -> PyIcechunkStore: ... +def pyicechunk_store_create( + storage: StorageConfig, config: StoreConfig | None +) -> PyIcechunkStore: ... + +async def async_pyicechunk_store_open_existing( + storage: StorageConfig, read_only: bool, config: StoreConfig | None +) -> PyIcechunkStore: ... +def pyicechunk_store_open_existing( + storage: StorageConfig, read_only: bool, config: StoreConfig | None +) -> PyIcechunkStore: ... +# async def pyicechunk_store_from_json_config( +# config: str, read_only: bool +# ) -> PyIcechunkStore: ... +def pyicechunk_store_from_bytes(bytes: bytes, read_only: bool) -> PyIcechunkStore: ... -async def pyicechunk_store_exists(storage: Storage) -> bool: ... 
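A hedged sketch of how the new `StoreConfig` and `VirtualRefConfig` types fit together, using the public `IcechunkStore.open_or_create` wrapper that the tests in this diff call (presumably routed through the `pyicechunk_store_*` bindings declared above); the path and threshold below are illustrative placeholders, not values taken from this change:

```python
import icechunk

config = icechunk.StoreConfig(
    # chunks smaller than this many bytes are written inline (512 is the documented default)
    inline_chunk_threshold_bytes=512,
    # resolve virtual chunk references against S3 using credentials from the environment
    virtual_ref_config=icechunk.VirtualRefConfig.s3_from_env(),
)

store = icechunk.IcechunkStore.open_or_create(
    storage=icechunk.StorageConfig.filesystem("/tmp/icechunk-repo"),
    mode="a",
    config=config,
)
```

The same `StoreConfig` object is what the synchronous `pyicechunk_store_create` and `pyicechunk_store_open_existing` functions declared just above accept alongside a `StorageConfig`.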
-async def pyicechunk_store_create(storage: Storage, config: StoreConfig = StoreConfig()) -> PyIcechunkStore: ... -async def pyicechunk_store_open_existing(storage: Storage, read_only: bool, config: StoreConfig = StoreConfig()) -> PyIcechunkStore: ... -async def pyicechunk_store_from_json_config(config: str, read_only: bool) -> PyIcechunkStore: ... +__version__: str diff --git a/icechunk-python/src/errors.rs b/icechunk-python/src/errors.rs index 9458fa48..8fd133dd 100644 --- a/icechunk-python/src/errors.rs +++ b/icechunk-python/src/errors.rs @@ -1,17 +1,22 @@ use icechunk::{ format::IcechunkFormatError, repository::RepositoryError, zarr::StoreError, }; -use pyo3::{exceptions::PyValueError, PyErr}; +use pyo3::{ + exceptions::{PyException, PyValueError}, + PyErr, +}; use thiserror::Error; /// A simple wrapper around the StoreError to make it easier to convert to a PyErr /// /// When you use the ? operator, the error is coerced. But if you return the value it is not. -/// So for now we just use the extra operation to get the coersion instead of manually mapping +/// So for now we just use the extra operation to get the coercion instead of manually mapping /// the errors where this is returned from a python class #[allow(clippy::enum_variant_names)] #[derive(Debug, Error)] pub(crate) enum PyIcechunkStoreError { + #[error("key not found error: {0}")] + KeyNotFound(#[from] KeyNotFound), #[error("store error: {0}")] StoreError(#[from] StoreError), #[error("repository Error: {0}")] @@ -33,3 +38,10 @@ impl From for PyErr { } pub(crate) type PyIcechunkStoreResult = Result; + +pyo3::create_exception!( + _icechunk_python, + KeyNotFound, + PyException, + "The key is not present in the repository" +); diff --git a/icechunk-python/src/lib.rs b/icechunk-python/src/lib.rs index 4ecdc4cb..224be2ff 100644 --- a/icechunk-python/src/lib.rs +++ b/icechunk-python/src/lib.rs @@ -2,7 +2,7 @@ mod errors; mod storage; mod streams; -use std::sync::Arc; +use std::{borrow::Cow, sync::Arc}; use ::icechunk::{format::ChunkOffset, Store}; use bytes::Bytes; @@ -13,21 +13,31 @@ use icechunk::{ format::{manifest::VirtualChunkRef, ChunkLength}, refs::Ref, repository::VirtualChunkLocation, + storage::virtual_ref::ObjectStoreVirtualChunkResolverConfig, zarr::{ - ConsolidatedStore, ObjectId, RepositoryConfig, StorageConfig, StoreOptions, - VersionInfo, + ConsolidatedStore, ObjectId, RepositoryConfig, StorageConfig, StoreError, + StoreOptions, VersionInfo, }, Repository, SnapshotMetadata, }; -use pyo3::{exceptions::PyValueError, prelude::*, types::PyBytes}; -use storage::{PyS3Credentials, PyStorage}; +use pyo3::{ + exceptions::PyValueError, + prelude::*, + types::{PyBytes, PyList, PyNone, PyString}, +}; +use storage::{PyS3Credentials, PyStorageConfig, PyVirtualRefConfig}; use streams::PyAsyncGenerator; -use tokio::sync::{Mutex, RwLock}; +use tokio::{ + runtime::Runtime, + sync::{Mutex, RwLock}, +}; + +pub use errors::KeyNotFound; #[pyclass] struct PyIcechunkStore { + consolidated: ConsolidatedStore, store: Arc>, - rt: tokio::runtime::Runtime, } #[pyclass(name = "StoreConfig")] @@ -36,17 +46,24 @@ struct PyStoreConfig { #[pyo3(get, set)] pub get_partial_values_concurrency: Option, #[pyo3(get, set)] - pub inline_chunk_threshold: Option, + pub inline_chunk_threshold_bytes: Option, #[pyo3(get, set)] pub unsafe_overwrite_refs: Option, + #[pyo3(get, set)] + pub virtual_ref_config: Option, } impl From<&PyStoreConfig> for RepositoryConfig { fn from(config: &PyStoreConfig) -> Self { RepositoryConfig { version: None, - 
inline_chunk_threshold_bytes: config.inline_chunk_threshold, + inline_chunk_threshold_bytes: config.inline_chunk_threshold_bytes, unsafe_overwrite_refs: config.unsafe_overwrite_refs, + change_set_bytes: None, + virtual_ref_config: config + .virtual_ref_config + .as_ref() + .map(ObjectStoreVirtualChunkResolverConfig::from), } } } @@ -68,13 +85,15 @@ impl PyStoreConfig { #[new] fn new( get_partial_values_concurrency: Option, - inline_chunk_threshold: Option, + inline_chunk_threshold_bytes: Option, unsafe_overwrite_refs: Option, + virtual_ref_config: Option, ) -> Self { PyStoreConfig { get_partial_values_concurrency, - inline_chunk_threshold, + inline_chunk_threshold_bytes, unsafe_overwrite_refs, + virtual_ref_config, } } } @@ -102,9 +121,15 @@ impl From for PySnapshotMetadata { type KeyRanges = Vec<(String, (Option, Option))>; impl PyIcechunkStore { + pub(crate) fn consolidated(&self) -> &ConsolidatedStore { + &self.consolidated + } + async fn store_exists(storage: StorageConfig) -> PyIcechunkStoreResult { - let storage = - storage.make_cached_storage().map_err(PyIcechunkStoreError::UnkownError)?; + let storage = storage + .make_cached_storage() + .await + .map_err(PyIcechunkStoreError::UnkownError)?; let exists = Repository::exists(storage.as_ref()).await?; Ok(exists) } @@ -115,20 +140,12 @@ impl PyIcechunkStore { repository_config: RepositoryConfig, store_config: StoreOptions, ) -> Result { - let access_mode = if read_only { - icechunk::zarr::AccessMode::ReadOnly - } else { - icechunk::zarr::AccessMode::ReadWrite - }; let repository = repository_config .with_version(VersionInfo::BranchTipRef(Ref::DEFAULT_BRANCH.to_string())); - let config = + let consolidated = ConsolidatedStore { storage, repository, config: Some(store_config) }; - let store = Store::from_consolidated(&config, access_mode).await?; - let store = Arc::new(RwLock::new(store)); - let rt = tokio::runtime::Runtime::new().map_err(|e| e.to_string())?; - Ok(Self { store, rt }) + PyIcechunkStore::from_consolidated(consolidated, read_only).await } async fn create( @@ -136,55 +153,77 @@ impl PyIcechunkStore { repository_config: RepositoryConfig, store_config: StoreOptions, ) -> Result { - let config = ConsolidatedStore { + let consolidated = ConsolidatedStore { storage, repository: repository_config, config: Some(store_config), }; - let store = - Store::from_consolidated(&config, icechunk::zarr::AccessMode::ReadWrite) - .await?; - let store = Arc::new(RwLock::new(store)); - let rt = tokio::runtime::Runtime::new().map_err(|e| e.to_string())?; - Ok(Self { store, rt }) + PyIcechunkStore::from_consolidated(consolidated, false).await } - async fn from_json_config(json: &[u8], read_only: bool) -> Result { + async fn from_consolidated( + consolidated: ConsolidatedStore, + read_only: bool, + ) -> Result { let access_mode = if read_only { icechunk::zarr::AccessMode::ReadOnly } else { icechunk::zarr::AccessMode::ReadWrite }; - let store = Store::from_json(json, access_mode).await?; + + let store = Store::from_consolidated(&consolidated, access_mode).await?; let store = Arc::new(RwLock::new(store)); - let rt = tokio::runtime::Runtime::new().map_err(|e| e.to_string())?; - Ok(Self { store, rt }) + Ok(Self { consolidated, store }) + } + + async fn as_consolidated(&self) -> PyIcechunkStoreResult { + let consolidated = self.consolidated.clone(); + + let store = self.store.read().await; + let version = store.current_version().await; + let change_set = store.change_set_bytes().await?; + + let consolidated = + 
consolidated.with_version(version).with_change_set_bytes(change_set)?; + Ok(consolidated) } } +fn mk_runtime() -> PyResult { + Ok(tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .map_err(|e| PyIcechunkStoreError::UnkownError(e.to_string()))?) +} + #[pyfunction] -fn pyicechunk_store_from_json_config( - py: Python<'_>, - json: String, +fn pyicechunk_store_open_existing( + storage: &PyStorageConfig, read_only: bool, -) -> PyResult> { - let json = json.as_bytes().to_owned(); + config: PyStoreConfig, +) -> PyResult { + let storage = storage.into(); + let repository_config = (&config).into(); + let store_config = (&config).into(); - // The commit mechanism is async and calls tokio::spawn so we need to use the - // pyo3_asyncio_0_21::tokio helper to run the async function in the tokio runtime - pyo3_asyncio_0_21::tokio::future_into_py(py, async move { - PyIcechunkStore::from_json_config(&json, read_only) - .await - .map_err(PyValueError::new_err) + let rt = mk_runtime()?; + rt.block_on(async move { + PyIcechunkStore::open_existing( + storage, + read_only, + repository_config, + store_config, + ) + .await + .map_err(PyValueError::new_err) }) } #[pyfunction] -#[pyo3(signature = (storage, read_only, config=PyStoreConfig::default()))] -fn pyicechunk_store_open_existing<'py>( +fn async_pyicechunk_store_open_existing<'py>( py: Python<'py>, - storage: &'py PyStorage, + storage: &'py PyStorageConfig, read_only: bool, config: PyStoreConfig, ) -> PyResult> { @@ -204,9 +243,9 @@ fn pyicechunk_store_open_existing<'py>( } #[pyfunction] -fn pyicechunk_store_exists<'py>( +fn async_pyicechunk_store_exists<'py>( py: Python<'py>, - storage: &'py PyStorage, + storage: &'py PyStorageConfig, ) -> PyResult> { let storage = storage.into(); pyo3_asyncio_0_21::tokio::future_into_py(py, async move { @@ -215,10 +254,18 @@ fn pyicechunk_store_exists<'py>( } #[pyfunction] -#[pyo3(signature = (storage, config=PyStoreConfig::default()))] -fn pyicechunk_store_create<'py>( +fn pyicechunk_store_exists(storage: &PyStorageConfig) -> PyResult { + let storage = storage.into(); + let rt = mk_runtime()?; + rt.block_on(async move { + PyIcechunkStore::store_exists(storage).await.map_err(PyErr::from) + }) +} + +#[pyfunction] +fn async_pyicechunk_store_create<'py>( py: Python<'py>, - storage: &'py PyStorage, + storage: &'py PyStorageConfig, config: PyStoreConfig, ) -> PyResult> { let storage = storage.into(); @@ -231,27 +278,102 @@ fn pyicechunk_store_create<'py>( }) } +#[pyfunction] +fn pyicechunk_store_create( + storage: &PyStorageConfig, + config: PyStoreConfig, +) -> PyResult { + let storage = storage.into(); + let repository_config = (&config).into(); + let store_config = (&config).into(); + let rt = mk_runtime()?; + rt.block_on(async move { + PyIcechunkStore::create(storage, repository_config, store_config) + .await + .map_err(PyValueError::new_err) + }) +} + +#[pyfunction] +fn pyicechunk_store_from_bytes( + bytes: Cow<[u8]>, + read_only: bool, +) -> PyResult { + // FIXME: Use rmp_serde instead of serde_json to optimize performance + let consolidated: ConsolidatedStore = serde_json::from_slice(&bytes) + .map_err(|e| PyValueError::new_err(e.to_string()))?; + + let rt = mk_runtime()?; + let store = rt.block_on(async move { + PyIcechunkStore::from_consolidated(consolidated, read_only) + .await + .map_err(PyValueError::new_err) + })?; + + Ok(store) +} + #[pymethods] impl PyIcechunkStore { - fn checkout_snapshot<'py>( + fn __eq__(&self, other: &Self) -> bool { + self.consolidated.storage == 
other.consolidated().storage + } + + fn as_bytes(&self) -> PyResult> { + let consolidated = + pyo3_asyncio_0_21::tokio::get_runtime().block_on(self.as_consolidated())?; + + // FIXME: Use rmp_serde instead of serde_json to optimize performance + let serialized = serde_json::to_vec(&consolidated) + .map_err(|e| PyValueError::new_err(e.to_string()))?; + Ok(Cow::Owned(serialized)) + } + + fn with_mode(&self, read_only: bool) -> PyResult { + let access_mode = if read_only { + icechunk::zarr::AccessMode::ReadOnly + } else { + icechunk::zarr::AccessMode::ReadWrite + }; + + let readable_store = self.store.blocking_read(); + let consolidated = + pyo3_asyncio_0_21::tokio::get_runtime().block_on(self.as_consolidated())?; + let store = Arc::new(RwLock::new(readable_store.with_access_mode(access_mode))); + Ok(PyIcechunkStore { consolidated, store }) + } + + fn async_checkout_snapshot<'py>( &'py self, py: Python<'py>, snapshot_id: String, ) -> PyResult> { - let snapshot_id = ObjectId::try_from(snapshot_id.as_str()).map_err(|e| { - PyIcechunkStoreError::UnkownError(format!( - "Error checking out snapshot {snapshot_id}: {e}" - )) - })?; + let store = Arc::clone(&self.store); + pyo3_asyncio_0_21::tokio::future_into_py(py, async move { + do_checkout_snapshot(store, snapshot_id).await + }) + } + fn checkout_snapshot<'py>( + &'py self, + py: Python<'py>, + snapshot_id: String, + ) -> PyResult> { + let store = Arc::clone(&self.store); + pyo3_asyncio_0_21::tokio::get_runtime().block_on(async move { + do_checkout_snapshot(store, snapshot_id).await?; + Ok(PyNone::get_bound(py).to_owned()) + }) + } + + fn async_checkout_branch<'py>( + &'py self, + py: Python<'py>, + branch: String, + ) -> PyResult> { let store = Arc::clone(&self.store); pyo3_asyncio_0_21::tokio::future_into_py(py, async move { - let mut store = store.write().await; - store - .checkout(VersionInfo::SnapshotId(snapshot_id)) - .await - .map_err(PyIcechunkStoreError::StoreError)?; - Ok(()) + do_checkout_branch(store, branch).await }) } @@ -259,15 +381,22 @@ impl PyIcechunkStore { &'py self, py: Python<'py>, branch: String, + ) -> PyResult> { + let store = Arc::clone(&self.store); + pyo3_asyncio_0_21::tokio::get_runtime().block_on(async move { + do_checkout_branch(store, branch).await?; + Ok(PyNone::get_bound(py).to_owned()) + }) + } + + fn async_checkout_tag<'py>( + &'py self, + py: Python<'py>, + tag: String, ) -> PyResult> { let store = Arc::clone(&self.store); pyo3_asyncio_0_21::tokio::future_into_py(py, async move { - let mut store = store.write().await; - store - .checkout(VersionInfo::BranchTipRef(branch)) - .await - .map_err(PyIcechunkStoreError::StoreError)?; - Ok(()) + do_checkout_tag(store, tag).await }) } @@ -275,44 +404,81 @@ impl PyIcechunkStore { &'py self, py: Python<'py>, tag: String, - ) -> PyResult> { + ) -> PyResult> { let store = Arc::clone(&self.store); - pyo3_asyncio_0_21::tokio::future_into_py(py, async move { - let mut store = store.write().await; - store - .checkout(VersionInfo::TagRef(tag)) - .await - .map_err(PyIcechunkStoreError::StoreError)?; - Ok(()) + pyo3_asyncio_0_21::tokio::get_runtime().block_on(async move { + do_checkout_tag(store, tag).await?; + Ok(PyNone::get_bound(py).to_owned()) }) } #[getter] fn snapshot_id(&self) -> PyIcechunkStoreResult { let store = self.store.blocking_read(); - let snapshot_id = self.rt.block_on(store.snapshot_id()); + let snapshot_id = + pyo3_asyncio_0_21::tokio::get_runtime().block_on(store.snapshot_id()); Ok(snapshot_id.to_string()) } - fn commit<'py>( + fn async_commit<'py>( &'py self, 
py: Python<'py>, message: String, ) -> PyResult> { let store = Arc::clone(&self.store); - // The commit mechanism is async and calls tokio::spawn so we need to use the - // pyo3_asyncio_0_21::tokio helper to run the async function in the tokio runtime pyo3_asyncio_0_21::tokio::future_into_py(py, async move { - let mut writeable_store = store.write().await; - let oid = writeable_store - .commit(&message) - .await - .map_err(PyIcechunkStoreError::from)?; - Ok(String::from(&oid)) + do_commit(store, message).await }) } + fn commit<'py>( + &'py self, + py: Python<'py>, + message: String, + ) -> PyResult> { + let store = Arc::clone(&self.store); + + pyo3_asyncio_0_21::tokio::get_runtime().block_on(async move { + let res = do_commit(store, message).await?; + Ok(PyString::new_bound(py, res.as_str())) + }) + } + + fn async_distributed_commit<'py>( + &'py self, + py: Python<'py>, + message: String, + other_change_set_bytes: Vec>, + ) -> PyResult> { + let store = Arc::clone(&self.store); + pyo3_asyncio_0_21::tokio::future_into_py(py, async move { + do_distributed_commit(store, message, other_change_set_bytes).await + }) + } + + fn distributed_commit<'py>( + &'py self, + py: Python<'py>, + message: String, + other_change_set_bytes: Vec>, + ) -> PyResult> { + let store = Arc::clone(&self.store); + pyo3_asyncio_0_21::tokio::get_runtime().block_on(async move { + let res = + do_distributed_commit(store, message, other_change_set_bytes).await?; + Ok(PyString::new_bound(py, res.as_str())) + }) + } + + fn change_set_bytes(&self) -> PyIcechunkStoreResult> { + let store = self.store.blocking_read(); + let res = pyo3_asyncio_0_21::tokio::get_runtime() + .block_on(store.change_set_bytes()) + .map_err(PyIcechunkStoreError::from)?; + Ok(res) + } + #[getter] fn branch(&self) -> PyIcechunkStoreResult> { let store = self.store.blocking_read(); @@ -323,21 +489,32 @@ impl PyIcechunkStore { #[getter] fn has_uncommitted_changes(&self) -> PyIcechunkStoreResult { let store = self.store.blocking_read(); - let has_uncommitted_changes = self.rt.block_on(store.has_uncommitted_changes()); + let has_uncommitted_changes = pyo3_asyncio_0_21::tokio::get_runtime() + .block_on(store.has_uncommitted_changes()); Ok(has_uncommitted_changes) } - fn reset<'py>(&'py self, py: Python<'py>) -> PyResult> { + fn async_reset<'py>(&'py self, py: Python<'py>) -> PyResult> { + let store = Arc::clone(&self.store); + pyo3_asyncio_0_21::tokio::future_into_py(py, async move { do_reset(store).await }) + } + + fn reset<'py>(&'py self, py: Python<'py>) -> PyResult> { let store = Arc::clone(&self.store); + pyo3_asyncio_0_21::tokio::get_runtime().block_on(async move { + do_reset(store).await?; + Ok(PyNone::get_bound(py).to_owned()) + }) + } + fn async_new_branch<'py>( + &'py self, + py: Python<'py>, + branch_name: String, + ) -> PyResult> { + let store = Arc::clone(&self.store); pyo3_asyncio_0_21::tokio::future_into_py(py, async move { - store - .write() - .await - .reset() - .await - .map_err(PyIcechunkStoreError::StoreError)?; - Ok(()) + do_new_branch(store, branch_name).await }) } @@ -345,22 +522,38 @@ impl PyIcechunkStore { &'py self, py: Python<'py>, branch_name: String, - ) -> PyResult> { + ) -> PyResult> { let store = Arc::clone(&self.store); + pyo3_asyncio_0_21::tokio::get_runtime().block_on(async move { + let res = do_new_branch(store, branch_name).await?; + Ok(PyString::new_bound(py, res.as_str())) + }) + } - // The commit mechanism is async and calls tokio::spawn so we need to use the - // pyo3_asyncio_0_21::tokio helper to run the async 
function in the tokio runtime + fn async_reset_branch<'py>( + &'py self, + py: Python<'py>, + to_snapshot: String, + ) -> PyResult> { + let store = Arc::clone(&self.store); pyo3_asyncio_0_21::tokio::future_into_py(py, async move { - let mut writeable_store = store.write().await; - let (oid, _version) = writeable_store - .new_branch(&branch_name) - .await - .map_err(PyIcechunkStoreError::from)?; - Ok(String::from(&oid)) + do_reset_branch(store, to_snapshot).await }) } - fn tag<'py>( + fn reset_branch<'py>( + &'py self, + py: Python<'py>, + to_snapshot: String, + ) -> PyResult> { + let store = Arc::clone(&self.store); + pyo3_asyncio_0_21::tokio::get_runtime().block_on(async move { + do_reset_branch(store, to_snapshot).await?; + Ok(PyNone::get_bound(py).to_owned()) + }) + } + + fn async_tag<'py>( &'py self, py: Python<'py>, tag: String, @@ -371,17 +564,45 @@ impl PyIcechunkStore { // The commit mechanism is async and calls tokio::spawn so we need to use the // pyo3_asyncio_0_21::tokio helper to run the async function in the tokio runtime pyo3_asyncio_0_21::tokio::future_into_py(py, async move { - let mut writeable_store = store.write().await; - let oid = ObjectId::try_from(snapshot_id.as_str()) - .map_err(|e| PyIcechunkStoreError::UnkownError(e.to_string()))?; - writeable_store.tag(&tag, &oid).await.map_err(PyIcechunkStoreError::from)?; - Ok(()) + do_tag(store, tag, snapshot_id).await + }) + } + + fn tag<'py>( + &'py self, + py: Python<'py>, + tag: String, + snapshot_id: String, + ) -> PyResult> { + let store = Arc::clone(&self.store); + pyo3_asyncio_0_21::tokio::get_runtime().block_on(async move { + do_tag(store, tag, snapshot_id).await?; + Ok(PyNone::get_bound(py).to_owned()) }) } - fn ancestry(&self) -> PyIcechunkStoreResult { - let list = self - .rt + fn ancestry<'py>( + &'py self, + py: Python<'py>, + ) -> PyIcechunkStoreResult> { + // TODO: this holds everything in memory + pyo3_asyncio_0_21::tokio::get_runtime().block_on(async move { + let store = self.store.read().await; + let list = store + .ancestry() + .await? 
+ .map_ok(|parent| { + let parent = Into::::into(parent); + Python::with_gil(|py| parent.into_py(py)) + }) + .try_collect::>() + .await?; + Ok(PyList::new_bound(py, list)) + }) + } + + fn async_ancestry(&self) -> PyIcechunkStoreResult { + let list = pyo3_asyncio_0_21::tokio::get_runtime() .block_on(async move { let store = self.store.read().await; store.ancestry().await @@ -411,6 +632,15 @@ impl PyIcechunkStore { }) } + fn sync_clear<'py>(&'py self, py: Python<'py>) -> PyResult> { + let store = Arc::clone(&self.store); + + pyo3_asyncio_0_21::tokio::get_runtime().block_on(async move { + store.write().await.clear().await.map_err(PyIcechunkStoreError::from)?; + Ok(PyNone::get_bound(py).to_owned()) + }) + } + fn get<'py>( &'py self, py: Python<'py>, @@ -420,17 +650,20 @@ impl PyIcechunkStore { let store = Arc::clone(&self.store); pyo3_asyncio_0_21::tokio::future_into_py(py, async move { let byte_range = byte_range.unwrap_or((None, None)).into(); - let data = store - .read() - .await - .get(&key, &byte_range) - .await - .map_err(PyIcechunkStoreError::from)?; - let pybytes = Python::with_gil(|py| { - let bound_bytes = PyBytes::new_bound(py, &data); - bound_bytes.to_object(py) - }); - Ok(pybytes) + let data = store.read().await.get(&key, &byte_range).await; + // We need to distinguish the "safe" case of trying to fetch an uninitialized key + // from other types of errors, we use KeyNotFound exception for that + match data { + Ok(data) => { + let pybytes = Python::with_gil(|py| { + let bound_bytes = PyBytes::new_bound(py, &data); + bound_bytes.to_object(py) + }); + Ok(pybytes) + } + Err(StoreError::NotFound(_)) => Err(KeyNotFound::new_err(key)), + Err(err) => Err(PyIcechunkStoreError::StoreError(err).into()), + } }) } @@ -448,6 +681,7 @@ impl PyIcechunkStore { .await .map_err(PyIcechunkStoreError::StoreError)?; + // FIXME: this processing is hiding errors in certain keys let result = partial_values_stream .into_iter() // If we want to error instead of returning None we can collect into @@ -520,31 +754,53 @@ impl PyIcechunkStore { }) } - fn set_virtual_ref<'py>( + fn set_if_not_exists<'py>( &'py self, py: Python<'py>, key: String, - location: String, - offset: ChunkOffset, - length: ChunkLength, + value: Vec, ) -> PyResult> { let store = Arc::clone(&self.store); pyo3_asyncio_0_21::tokio::future_into_py(py, async move { - let virtual_ref = VirtualChunkRef { - location: VirtualChunkLocation::Absolute(location), - offset, - length, - }; - let mut store = store.write().await; + let store = store.read().await; store - .set_virtual_ref(&key, virtual_ref) + .set_if_not_exists(&key, Bytes::from(value)) .await .map_err(PyIcechunkStoreError::from)?; Ok(()) }) } + fn async_set_virtual_ref<'py>( + &'py self, + py: Python<'py>, + key: String, + location: String, + offset: ChunkOffset, + length: ChunkLength, + ) -> PyResult> { + let store = Arc::clone(&self.store); + pyo3_asyncio_0_21::tokio::future_into_py(py, async move { + do_set_virtual_ref(store, key, location, offset, length).await + }) + } + + fn set_virtual_ref<'py>( + &'py self, + py: Python<'py>, + key: String, + location: String, + offset: ChunkOffset, + length: ChunkLength, + ) -> PyResult> { + let store = Arc::clone(&self.store); + pyo3_asyncio_0_21::tokio::get_runtime().block_on(async move { + do_set_virtual_ref(store, key, location, offset, length).await?; + Ok(PyNone::get_bound(py).to_owned()) + }) + } + fn delete<'py>( &'py self, py: Python<'py>, @@ -616,8 +872,7 @@ impl PyIcechunkStore { } fn list(&self) -> PyIcechunkStoreResult { - let list = 
self - .rt + let list = pyo3_asyncio_0_21::tokio::get_runtime() .block_on(async move { let store = self.store.read().await; store.list().await @@ -629,8 +884,7 @@ impl PyIcechunkStore { } fn list_prefix(&self, prefix: String) -> PyIcechunkStoreResult { - let list = self - .rt + let list = pyo3_asyncio_0_21::tokio::get_runtime() .block_on(async move { let store = self.store.read().await; store.list_prefix(prefix.as_str()).await @@ -641,8 +895,7 @@ impl PyIcechunkStore { } fn list_dir(&self, prefix: String) -> PyIcechunkStoreResult { - let list = self - .rt + let list = pyo3_asyncio_0_21::tokio::get_runtime() .block_on(async move { let store = self.store.read().await; store.list_dir(prefix.as_str()).await @@ -653,17 +906,138 @@ impl PyIcechunkStore { } } +async fn do_commit(store: Arc>, message: String) -> PyResult { + let mut store = store.write().await; + let oid = store.commit(&message).await.map_err(PyIcechunkStoreError::from)?; + Ok(String::from(&oid)) +} + +async fn do_checkout_snapshot( + store: Arc>, + snapshot_id: String, +) -> PyResult<()> { + let snapshot_id = ObjectId::try_from(snapshot_id.as_str()).map_err(|e| { + PyIcechunkStoreError::UnkownError(format!( + "Error checking out snapshot {snapshot_id}: {e}" + )) + })?; + + let mut store = store.write().await; + store + .checkout(VersionInfo::SnapshotId(snapshot_id)) + .await + .map_err(PyIcechunkStoreError::StoreError)?; + Ok(()) +} + +async fn do_checkout_branch(store: Arc>, branch: String) -> PyResult<()> { + let mut store = store.write().await; + store + .checkout(VersionInfo::BranchTipRef(branch)) + .await + .map_err(PyIcechunkStoreError::StoreError)?; + Ok(()) +} + +async fn do_checkout_tag(store: Arc>, tag: String) -> PyResult<()> { + let mut store = store.write().await; + store + .checkout(VersionInfo::TagRef(tag)) + .await + .map_err(PyIcechunkStoreError::StoreError)?; + Ok(()) +} + +async fn do_distributed_commit( + store: Arc>, + message: String, + other_change_set_bytes: Vec>, +) -> PyResult { + let mut writeable_store = store.write().await; + let oid = writeable_store + .distributed_commit(&message, other_change_set_bytes) + .await + .map_err(PyIcechunkStoreError::from)?; + Ok(String::from(&oid)) +} + +async fn do_reset<'py>(store: Arc>) -> PyResult<()> { + store.write().await.reset().await.map_err(PyIcechunkStoreError::StoreError)?; + Ok(()) +} + +async fn do_new_branch<'py>( + store: Arc>, + branch_name: String, +) -> PyResult { + let mut writeable_store = store.write().await; + let (oid, _version) = writeable_store + .new_branch(&branch_name) + .await + .map_err(PyIcechunkStoreError::from)?; + Ok(String::from(&oid)) +} + +async fn do_reset_branch<'py>( + store: Arc>, + to_snapshot: String, +) -> PyResult<()> { + let to_snapshot = ObjectId::try_from(to_snapshot.as_str()) + .map_err(|e| PyIcechunkStoreError::UnkownError(e.to_string()))?; + let mut writeable_store = store.write().await; + writeable_store + .reset_branch(to_snapshot) + .await + .map_err(PyIcechunkStoreError::from)?; + Ok(()) +} + +async fn do_tag<'py>( + store: Arc>, + tag: String, + snapshot_id: String, +) -> PyResult<()> { + let mut writeable_store = store.write().await; + let oid = ObjectId::try_from(snapshot_id.as_str()) + .map_err(|e| PyIcechunkStoreError::UnkownError(e.to_string()))?; + writeable_store.tag(&tag, &oid).await.map_err(PyIcechunkStoreError::from)?; + Ok(()) +} + +async fn do_set_virtual_ref( + store: Arc>, + key: String, + location: String, + offset: ChunkOffset, + length: ChunkLength, +) -> PyResult<()> { + let virtual_ref = 
VirtualChunkRef { + location: VirtualChunkLocation::Absolute(location), + offset, + length, + }; + let mut store = store.write().await; + store.set_virtual_ref(&key, virtual_ref).await.map_err(PyIcechunkStoreError::from)?; + Ok(()) +} + /// The icechunk Python module implemented in Rust. #[pymodule] -fn _icechunk_python(m: &Bound<'_, PyModule>) -> PyResult<()> { - m.add_class::()?; +fn _icechunk_python(py: Python<'_>, m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add("__version__", env!("CARGO_PKG_VERSION"))?; + m.add("KeyNotFound", py.get_type_bound::())?; + m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; m.add_class::()?; - m.add_function(wrap_pyfunction!(pyicechunk_store_from_json_config, m)?)?; + m.add_class::()?; m.add_function(wrap_pyfunction!(pyicechunk_store_exists, m)?)?; + m.add_function(wrap_pyfunction!(async_pyicechunk_store_exists, m)?)?; m.add_function(wrap_pyfunction!(pyicechunk_store_create, m)?)?; + m.add_function(wrap_pyfunction!(async_pyicechunk_store_create, m)?)?; m.add_function(wrap_pyfunction!(pyicechunk_store_open_existing, m)?)?; + m.add_function(wrap_pyfunction!(async_pyicechunk_store_open_existing, m)?)?; + m.add_function(wrap_pyfunction!(pyicechunk_store_from_bytes, m)?)?; Ok(()) } diff --git a/icechunk-python/src/storage.rs b/icechunk-python/src/storage.rs index 4c0a6f1c..a5e2d0fd 100644 --- a/icechunk-python/src/storage.rs +++ b/icechunk-python/src/storage.rs @@ -1,6 +1,15 @@ +#![allow(clippy::too_many_arguments)] +// TODO: we only need that allow for PyStorageConfig, but i don't know how to set it + use std::path::PathBuf; -use icechunk::{storage::object_store::S3Credentials, zarr::StorageConfig}; +use icechunk::{ + storage::{ + s3::{S3Config, S3Credentials, StaticS3Credentials}, + virtual_ref::ObjectStoreVirtualChunkResolverConfig, + }, + zarr::StorageConfig, +}; use pyo3::{prelude::*, types::PyType}; #[pyclass(name = "S3Credentials")] @@ -14,9 +23,9 @@ pub struct PyS3Credentials { session_token: Option, } -impl From<&PyS3Credentials> for S3Credentials { +impl From<&PyS3Credentials> for StaticS3Credentials { fn from(credentials: &PyS3Credentials) -> Self { - S3Credentials { + StaticS3Credentials { access_key_id: credentials.access_key_id.clone(), secret_access_key: credentials.secret_access_key.clone(), session_token: credentials.session_token.clone(), @@ -36,8 +45,8 @@ impl PyS3Credentials { } } -#[pyclass(name = "Storage")] -pub enum PyStorage { +#[pyclass(name = "StorageConfig")] +pub enum PyStorageConfig { Memory { prefix: Option, }, @@ -47,21 +56,24 @@ pub enum PyStorage { S3 { bucket: String, prefix: String, + anon: bool, credentials: Option, endpoint_url: Option, + allow_http: Option, + region: Option, }, } #[pymethods] -impl PyStorage { +impl PyStorageConfig { #[classmethod] fn memory(_cls: &Bound<'_, PyType>, prefix: Option) -> Self { - PyStorage::Memory { prefix } + PyStorageConfig::Memory { prefix } } #[classmethod] fn filesystem(_cls: &Bound<'_, PyType>, root: String) -> Self { - PyStorage::Filesystem { root } + PyStorageConfig::Filesystem { root } } #[classmethod] @@ -70,39 +82,183 @@ impl PyStorage { bucket: String, prefix: String, endpoint_url: Option, + allow_http: Option, + region: Option, ) -> Self { - PyStorage::S3 { bucket, prefix, credentials: None, endpoint_url } + PyStorageConfig::S3 { + bucket, + prefix, + anon: false, + credentials: None, + endpoint_url, + allow_http, + region, + } } #[classmethod] - fn s3_from_credentials( + fn s3_from_config( _cls: &Bound<'_, PyType>, bucket: String, prefix: String, 
credentials: PyS3Credentials, endpoint_url: Option, + allow_http: Option, + region: Option, ) -> Self { - PyStorage::S3 { bucket, prefix, credentials: Some(credentials), endpoint_url } + PyStorageConfig::S3 { + bucket, + prefix, + anon: false, + credentials: Some(credentials), + endpoint_url, + allow_http, + region, + } + } + + #[classmethod] + fn s3_anonymous( + _cls: &Bound<'_, PyType>, + bucket: String, + prefix: String, + endpoint_url: Option, + allow_http: Option, + region: Option, + ) -> Self { + PyStorageConfig::S3 { + bucket, + prefix, + anon: true, + credentials: None, + endpoint_url, + allow_http, + region, + } } } -impl From<&PyStorage> for StorageConfig { - fn from(storage: &PyStorage) -> Self { +fn mk_credentials(config: Option<&PyS3Credentials>, anon: bool) -> S3Credentials { + if anon { + S3Credentials::Anonymous + } else { + match config { + None => S3Credentials::FromEnv, + Some(credentials) => S3Credentials::Static(credentials.into()), + } + } +} + +impl From<&PyStorageConfig> for StorageConfig { + fn from(storage: &PyStorageConfig) -> Self { match storage { - PyStorage::Memory { prefix } => { + PyStorageConfig::Memory { prefix } => { StorageConfig::InMemory { prefix: prefix.clone() } } - PyStorage::Filesystem { root } => { + PyStorageConfig::Filesystem { root } => { StorageConfig::LocalFileSystem { root: PathBuf::from(root.clone()) } } - PyStorage::S3 { bucket, prefix, credentials, endpoint_url } => { + PyStorageConfig::S3 { + bucket, + prefix, + anon, + credentials, + endpoint_url, + allow_http, + region, + } => { + let s3_config = S3Config { + region: region.clone(), + credentials: mk_credentials(credentials.as_ref(), *anon), + endpoint: endpoint_url.clone(), + allow_http: allow_http.unwrap_or(false), + }; + StorageConfig::S3ObjectStore { bucket: bucket.clone(), prefix: prefix.clone(), - credentials: credentials.as_ref().map(S3Credentials::from), - endpoint: endpoint_url.clone(), + config: Some(s3_config), } } } } } + +#[pyclass(name = "VirtualRefConfig")] +#[derive(Clone, Debug)] +pub enum PyVirtualRefConfig { + S3 { + credentials: Option, + endpoint_url: Option, + allow_http: Option, + region: Option, + anon: bool, + }, +} + +#[pymethods] +impl PyVirtualRefConfig { + #[classmethod] + fn s3_from_env(_cls: &Bound<'_, PyType>) -> Self { + PyVirtualRefConfig::S3 { + credentials: None, + endpoint_url: None, + allow_http: None, + region: None, + anon: false, + } + } + + #[classmethod] + fn s3_from_config( + _cls: &Bound<'_, PyType>, + credentials: PyS3Credentials, + endpoint_url: Option, + allow_http: Option, + region: Option, + anon: Option, + ) -> Self { + PyVirtualRefConfig::S3 { + credentials: Some(credentials), + endpoint_url, + allow_http, + region, + anon: anon.unwrap_or(false), + } + } + + #[classmethod] + fn s3_anonymous( + _cls: &Bound<'_, PyType>, + endpoint_url: Option, + allow_http: Option, + region: Option, + ) -> Self { + PyVirtualRefConfig::S3 { + credentials: None, + endpoint_url, + allow_http, + region, + anon: true, + } + } +} + +impl From<&PyVirtualRefConfig> for ObjectStoreVirtualChunkResolverConfig { + fn from(config: &PyVirtualRefConfig) -> Self { + match config { + PyVirtualRefConfig::S3 { + credentials, + endpoint_url, + allow_http, + region, + anon, + } => ObjectStoreVirtualChunkResolverConfig::S3(S3Config { + region: region.clone(), + endpoint: endpoint_url.clone(), + credentials: mk_credentials(credentials.as_ref(), *anon), + allow_http: allow_http.unwrap_or(false), + }), + } + } +} diff --git a/icechunk-python/tests/conftest.py 
b/icechunk-python/tests/conftest.py index 414ef509..94e352fe 100644 --- a/icechunk-python/tests/conftest.py +++ b/icechunk-python/tests/conftest.py @@ -1,21 +1,21 @@ from typing import Literal -from icechunk import IcechunkStore, Storage import pytest +from icechunk import IcechunkStore, StorageConfig -async def parse_store(store: Literal["local", "memory"], path: str) -> IcechunkStore: +def parse_store(store: Literal["local", "memory"], path: str) -> IcechunkStore: if store == "local": - return await IcechunkStore.create( - storage=Storage.filesystem(path), + return IcechunkStore.create( + storage=StorageConfig.filesystem(path), ) if store == "memory": - return await IcechunkStore.create( - storage=Storage.memory(path), + return IcechunkStore.create( + storage=StorageConfig.memory(path), ) @pytest.fixture(scope="function") -async def store(request: pytest.FixtureRequest, tmpdir: str) -> IcechunkStore: +def store(request: pytest.FixtureRequest, tmpdir: str) -> IcechunkStore: param = request.param - return await parse_store(param, str(tmpdir)) + return parse_store(param, str(tmpdir)) diff --git a/icechunk-python/tests/data/test-repo/chunks/1H3ZMQ27T6XPD5CGK1DG b/icechunk-python/tests/data/test-repo/chunks/1H3ZMQ27T6XPD5CGK1DG new file mode 100644 index 00000000..bf221227 Binary files /dev/null and b/icechunk-python/tests/data/test-repo/chunks/1H3ZMQ27T6XPD5CGK1DG differ diff --git a/icechunk-python/tests/data/test-repo/chunks/EWW1EVYRD0RVW23YZ3N0 b/icechunk-python/tests/data/test-repo/chunks/EWW1EVYRD0RVW23YZ3N0 new file mode 100644 index 00000000..bf221227 Binary files /dev/null and b/icechunk-python/tests/data/test-repo/chunks/EWW1EVYRD0RVW23YZ3N0 differ diff --git a/icechunk-python/tests/data/test-repo/chunks/HDRYBA66N2Z6YEV174D0 b/icechunk-python/tests/data/test-repo/chunks/HDRYBA66N2Z6YEV174D0 new file mode 100644 index 00000000..bf221227 Binary files /dev/null and b/icechunk-python/tests/data/test-repo/chunks/HDRYBA66N2Z6YEV174D0 differ diff --git a/icechunk-python/tests/data/test-repo/chunks/Q7HMN2SYVTRD4YP93780 b/icechunk-python/tests/data/test-repo/chunks/Q7HMN2SYVTRD4YP93780 new file mode 100644 index 00000000..bf221227 Binary files /dev/null and b/icechunk-python/tests/data/test-repo/chunks/Q7HMN2SYVTRD4YP93780 differ diff --git a/icechunk-python/tests/data/test-repo/manifests/BNV9Q6Q9Y9VKYT45MHJG b/icechunk-python/tests/data/test-repo/manifests/BNV9Q6Q9Y9VKYT45MHJG new file mode 100644 index 00000000..4d1ca5ff Binary files /dev/null and b/icechunk-python/tests/data/test-repo/manifests/BNV9Q6Q9Y9VKYT45MHJG differ diff --git a/icechunk-python/tests/data/test-repo/manifests/ME8XN2EDY7P3E2XSR5TG b/icechunk-python/tests/data/test-repo/manifests/ME8XN2EDY7P3E2XSR5TG new file mode 100644 index 00000000..1024a982 Binary files /dev/null and b/icechunk-python/tests/data/test-repo/manifests/ME8XN2EDY7P3E2XSR5TG differ diff --git a/icechunk-python/tests/data/test-repo/manifests/PGFG33J5SJNP38N7NFV0 b/icechunk-python/tests/data/test-repo/manifests/PGFG33J5SJNP38N7NFV0 new file mode 100644 index 00000000..6d01733c Binary files /dev/null and b/icechunk-python/tests/data/test-repo/manifests/PGFG33J5SJNP38N7NFV0 differ diff --git a/icechunk-python/tests/data/test-repo/manifests/Z77TTBHNH7GG72HRTXN0 b/icechunk-python/tests/data/test-repo/manifests/Z77TTBHNH7GG72HRTXN0 new file mode 100644 index 00000000..4d1ca5ff Binary files /dev/null and b/icechunk-python/tests/data/test-repo/manifests/Z77TTBHNH7GG72HRTXN0 differ diff --git a/icechunk-python/tests/data/test-repo/refs/branch.main/ZZZZZZZW.json 
b/icechunk-python/tests/data/test-repo/refs/branch.main/ZZZZZZZW.json new file mode 100644 index 00000000..d43d4f26 --- /dev/null +++ b/icechunk-python/tests/data/test-repo/refs/branch.main/ZZZZZZZW.json @@ -0,0 +1 @@ +{"snapshot":"RZD9SW6JJZHKA94VY1DG"} \ No newline at end of file diff --git a/icechunk-python/tests/data/test-repo/refs/branch.main/ZZZZZZZX.json b/icechunk-python/tests/data/test-repo/refs/branch.main/ZZZZZZZX.json new file mode 100644 index 00000000..c2f2c9d1 --- /dev/null +++ b/icechunk-python/tests/data/test-repo/refs/branch.main/ZZZZZZZX.json @@ -0,0 +1 @@ +{"snapshot":"F8R612XQGFW9CR08HTN0"} \ No newline at end of file diff --git a/icechunk-python/tests/data/test-repo/refs/branch.main/ZZZZZZZY.json b/icechunk-python/tests/data/test-repo/refs/branch.main/ZZZZZZZY.json new file mode 100644 index 00000000..b4eed8ff --- /dev/null +++ b/icechunk-python/tests/data/test-repo/refs/branch.main/ZZZZZZZY.json @@ -0,0 +1 @@ +{"snapshot":"J4DQH8NAGRYC10YGT7F0"} \ No newline at end of file diff --git a/icechunk-python/tests/data/test-repo/refs/branch.main/ZZZZZZZZ.json b/icechunk-python/tests/data/test-repo/refs/branch.main/ZZZZZZZZ.json new file mode 100644 index 00000000..eabcf636 --- /dev/null +++ b/icechunk-python/tests/data/test-repo/refs/branch.main/ZZZZZZZZ.json @@ -0,0 +1 @@ +{"snapshot":"WT2Z2GQ09G0RTAEQ3D70"} \ No newline at end of file diff --git a/icechunk-python/tests/data/test-repo/refs/branch.my-branch/ZZZZZZZX.json b/icechunk-python/tests/data/test-repo/refs/branch.my-branch/ZZZZZZZX.json new file mode 100644 index 00000000..7b2fdace --- /dev/null +++ b/icechunk-python/tests/data/test-repo/refs/branch.my-branch/ZZZZZZZX.json @@ -0,0 +1 @@ +{"snapshot":"0XEZMR4J7SJ5QBWAYBE0"} \ No newline at end of file diff --git a/icechunk-python/tests/data/test-repo/refs/branch.my-branch/ZZZZZZZY.json b/icechunk-python/tests/data/test-repo/refs/branch.my-branch/ZZZZZZZY.json new file mode 100644 index 00000000..a585a1a2 --- /dev/null +++ b/icechunk-python/tests/data/test-repo/refs/branch.my-branch/ZZZZZZZY.json @@ -0,0 +1 @@ +{"snapshot":"QVY2RGJBE9DX8E525CA0"} \ No newline at end of file diff --git a/icechunk-python/tests/data/test-repo/refs/branch.my-branch/ZZZZZZZZ.json b/icechunk-python/tests/data/test-repo/refs/branch.my-branch/ZZZZZZZZ.json new file mode 100644 index 00000000..d43d4f26 --- /dev/null +++ b/icechunk-python/tests/data/test-repo/refs/branch.my-branch/ZZZZZZZZ.json @@ -0,0 +1 @@ +{"snapshot":"RZD9SW6JJZHKA94VY1DG"} \ No newline at end of file diff --git a/icechunk-python/tests/data/test-repo/refs/tag.it also works!/ref.json b/icechunk-python/tests/data/test-repo/refs/tag.it also works!/ref.json new file mode 100644 index 00000000..7b2fdace --- /dev/null +++ b/icechunk-python/tests/data/test-repo/refs/tag.it also works!/ref.json @@ -0,0 +1 @@ +{"snapshot":"0XEZMR4J7SJ5QBWAYBE0"} \ No newline at end of file diff --git a/icechunk-python/tests/data/test-repo/refs/tag.it works!/ref.json b/icechunk-python/tests/data/test-repo/refs/tag.it works!/ref.json new file mode 100644 index 00000000..a585a1a2 --- /dev/null +++ b/icechunk-python/tests/data/test-repo/refs/tag.it works!/ref.json @@ -0,0 +1 @@ +{"snapshot":"QVY2RGJBE9DX8E525CA0"} \ No newline at end of file diff --git a/icechunk-python/tests/data/test-repo/snapshots/0XEZMR4J7SJ5QBWAYBE0 b/icechunk-python/tests/data/test-repo/snapshots/0XEZMR4J7SJ5QBWAYBE0 new file mode 100644 index 00000000..80723932 Binary files /dev/null and b/icechunk-python/tests/data/test-repo/snapshots/0XEZMR4J7SJ5QBWAYBE0 differ diff --git 
a/icechunk-python/tests/data/test-repo/snapshots/F8R612XQGFW9CR08HTN0 b/icechunk-python/tests/data/test-repo/snapshots/F8R612XQGFW9CR08HTN0 new file mode 100644 index 00000000..0ecfa07f Binary files /dev/null and b/icechunk-python/tests/data/test-repo/snapshots/F8R612XQGFW9CR08HTN0 differ diff --git a/icechunk-python/tests/data/test-repo/snapshots/J4DQH8NAGRYC10YGT7F0 b/icechunk-python/tests/data/test-repo/snapshots/J4DQH8NAGRYC10YGT7F0 new file mode 100644 index 00000000..156366a3 Binary files /dev/null and b/icechunk-python/tests/data/test-repo/snapshots/J4DQH8NAGRYC10YGT7F0 differ diff --git a/icechunk-python/tests/data/test-repo/snapshots/QVY2RGJBE9DX8E525CA0 b/icechunk-python/tests/data/test-repo/snapshots/QVY2RGJBE9DX8E525CA0 new file mode 100644 index 00000000..8f9fa672 Binary files /dev/null and b/icechunk-python/tests/data/test-repo/snapshots/QVY2RGJBE9DX8E525CA0 differ diff --git a/icechunk-python/tests/data/test-repo/snapshots/RZD9SW6JJZHKA94VY1DG b/icechunk-python/tests/data/test-repo/snapshots/RZD9SW6JJZHKA94VY1DG new file mode 100644 index 00000000..10ee47ac Binary files /dev/null and b/icechunk-python/tests/data/test-repo/snapshots/RZD9SW6JJZHKA94VY1DG differ diff --git a/icechunk-python/tests/data/test-repo/snapshots/WT2Z2GQ09G0RTAEQ3D70 b/icechunk-python/tests/data/test-repo/snapshots/WT2Z2GQ09G0RTAEQ3D70 new file mode 100644 index 00000000..52d1ff26 Binary files /dev/null and b/icechunk-python/tests/data/test-repo/snapshots/WT2Z2GQ09G0RTAEQ3D70 differ diff --git a/icechunk-python/tests/test_can_read_old.py b/icechunk-python/tests/test_can_read_old.py new file mode 100644 index 00000000..4e598f44 --- /dev/null +++ b/icechunk-python/tests/test_can_read_old.py @@ -0,0 +1,227 @@ +"""This test reads a repository generated with an older version of icechunk. + +In this way, we check we maintain read compatibility. The repository lives +in the git repository, as a filesystem store, in the directory icechunk-python/tests/data/test-repo + +If something changes in the on disk format, we probably won't be able to read the repo, the +test will fail and we can avoid breaking user data. + +When new features that impact the stored info are added, or when the on-disk format is +intentionally changed, the repository files must be regenerated. For that, run the current +file as a python script: `python ./tests/test_can_read_old.py`. 
+""" + +import icechunk as ic +import zarr +from numpy.testing import assert_array_equal +from object_store import ClientOptions, ObjectStore + + +def write_chunks_to_minio(chunks: list[tuple[str, bytes]]): + client_options = ClientOptions( + allow_http=True, # type: ignore + ) + store = ObjectStore( + "s3://testbucket", + { + "access_key_id": "minio123", + "secret_access_key": "minio123", + "aws_region": "us-east-1", + "aws_endpoint": "http://localhost:9000", + }, + client_options=client_options, + ) + + for key, data in chunks: + store.put(key, data) + + +def mk_store(mode): + """Create a store that can access virtual chunks in localhost MinIO""" + store_path = "./tests/data/test-repo" + store = ic.IcechunkStore.open_or_create( + storage=ic.StorageConfig.filesystem(store_path), + config=ic.StoreConfig( + inline_chunk_threshold_bytes=10, + virtual_ref_config=ic.VirtualRefConfig.s3_from_config( + credentials=ic.S3Credentials( + access_key_id="minio123", + secret_access_key="minio123", + ), + endpoint_url="http://localhost:9000", + allow_http=True, + region="us-east-1", + ), + ), + mode=mode, + ) + return store + + +async def write_a_test_repo(): + """Write the test repository. + + This function tries to explore as many icechunk features as possible, to generate + an richer repository on disk. For example, it does several commits, it has a hierarchy, + it has virtual, inline and materialized chunks, branches and tags, etc. + + PLEASE: keep addign more actions to this function as we add more features to Icechunk. + """ + + print("Writing repository to ./tests/data/test-repo") + store = mk_store("w") + + root = zarr.group(store=store) + group1 = root.create_group( + "group1", attributes={"this": "is a nice group", "icechunk": 1, "size": 42.0} + ) + + # these chunks will be materialized + big_chunks = group1.create_array( + "big_chunks", + shape=(10, 10), + chunk_shape=(5, 5), + dtype="float32", + fill_value=float("nan"), + attributes={"this": "is a nice array", "icechunk": 1, "size": 42.0}, + ) + + # these chunks will be inline + small_chunks = group1.create_array( + "small_chunks", + shape=(5), + chunk_shape=(1), + dtype="int8", + fill_value=8, + attributes={"this": "is a nice array", "icechunk": 1, "size": 42.0}, + ) + store.commit("empty structure") + + big_chunks[:] = 42.0 + small_chunks[:] = 84 + store.commit("fill data") + + store.set_virtual_ref( + "group1/big_chunks/c/0/0", + "s3://testbucket/path/to/python/chunk-1", + offset=0, + length=5 * 5 * 4, + ) + store.commit("set virtual chunk") + + store.new_branch("my-branch") + await store.delete("group1/small_chunks/c/4") + snap4 = store.commit("delete a chunk") + + store.tag("it works!", snap4) + + group2 = root.create_group( + "group2", attributes={"this": "is a nice group", "icechunk": 1, "size": 42.0} + ) + group3 = group2.create_group( + "group3", attributes={"this": "is a nice group", "icechunk": 1, "size": 42.0} + ) + group4 = group3.create_group( + "group4", attributes={"this": "is a nice group", "icechunk": 1, "size": 42.0} + ) + group5 = group4.create_group( + "group5", attributes={"this": "is a nice group", "icechunk": 1, "size": 42.0} + ) + group5.create_array( + "inner", + shape=(10, 10), + chunk_shape=(5, 5), + dtype="float32", + fill_value=float("nan"), + attributes={"this": "is a nice array", "icechunk": 1, "size": 42.0}, + ) + snap5 = store.commit("some more structure") + store.tag("it also works!", snap5) + + store.close() + + +async def test_icechunk_can_read_old_repo(): + store = mk_store("r") + + expected_main_history 
= [ + "set virtual chunk", + "fill data", + "empty structure", + "Repository initialized", + ] + assert [p.message for p in store.ancestry()] == expected_main_history + + store.checkout(branch="my-branch") + expected_branch_history = [ + "some more structure", + "delete a chunk", + ] + expected_main_history + assert [p.message for p in store.ancestry()] == expected_branch_history + + store.checkout(tag="it also works!") + assert [p.message for p in store.ancestry()] == expected_branch_history + + store.checkout(tag="it works!") + assert [p.message for p in store.ancestry()] == expected_branch_history[1:] + + store = mk_store("r") + store.checkout(branch="my-branch") + assert sorted([p async for p in store.list_dir("")]) == [ + "group1", + "group2", + "zarr.json", + ] + assert sorted([p async for p in store.list_dir("group1")]) == [ + "big_chunks", + "small_chunks", + "zarr.json", + ] + assert sorted([p async for p in store.list_dir("group2")]) == ["group3", "zarr.json"] + assert sorted([p async for p in store.list_dir("group2/group3")]) == [ + "group4", + "zarr.json", + ] + assert sorted([p async for p in store.list_dir("group2/group3/group4")]) == [ + "group5", + "zarr.json", + ] + assert sorted([p async for p in store.list_dir("group2/group3/group4/group5")]) == [ + "inner", + "zarr.json", + ] + assert sorted( + [p async for p in store.list_dir("group2/group3/group4/group5/inner")] + ) == ["zarr.json"] + + root = zarr.group(store=store) + # inner is not initialized, so it's all fill values + inner = root["group2/group3/group4/group5/inner"] + assert_array_equal(inner[:], float("nan")) + + small_chunks = root["group1/small_chunks"] + # has 5 elements, we deleted the last chunk (of size 1), and the fill value is 8 + assert_array_equal(small_chunks[:], [84, 84, 84, 84, 8]) + + # big_chunks array has a virtual chunk, so we need to write it to local MinIO + # we get the bytes from one of the materialized chunks + buffer_prototype = zarr.core.buffer.default_buffer_prototype() + chunk_data = ( + await store.get("group1/big_chunks/c/0/1", prototype=buffer_prototype) + ).to_bytes() + + # big chunks array has a virtual chunk pointing here + write_chunks_to_minio( + [ + ("path/to/python/chunk-1", chunk_data), + ] + ) + + big_chunks = root["group1/big_chunks"] + assert_array_equal(big_chunks[:], 42.0) + + +if __name__ == "__main__": + import asyncio + + asyncio.run(write_a_test_repo()) diff --git a/icechunk-python/tests/test_concurrency.py b/icechunk-python/tests/test_concurrency.py index 65af93e3..95504fbf 100644 --- a/icechunk-python/tests/test_concurrency.py +++ b/icechunk-python/tests/test_concurrency.py @@ -1,28 +1,34 @@ import asyncio import random -import zarr import icechunk +import zarr N = 15 + async def write_to_store(array, x, y, barrier): await barrier.wait() - await asyncio.sleep(random.uniform(0,0.5)) - array[x,y] = x*y - #await asyncio.sleep(0) + await asyncio.sleep(random.uniform(0, 0.5)) + array[x, y] = x * y + # await asyncio.sleep(0) + async def read_store(array, x, y, barrier): await barrier.wait() while True: - #print(f"reading {x},{y}") - value = array[x,y] - if value == x*y: + # print(f"reading {x},{y}") + value = array[x, y] + if value == x * y: break - await asyncio.sleep(random.uniform(0,0.1)) + await asyncio.sleep(random.uniform(0, 0.1)) + async def list_store(store, barrier): - expected = set(['zarr.json', 'array/zarr.json'] + [f"array/c/{x}/{y}" for x in range(N) for y in range(N)]) + expected = set( + ["zarr.json", "array/zarr.json"] + + [f"array/c/{x}/{y}" for x in 
range(N) for y in range(N)] + ) await barrier.wait() while True: current = set([k async for k in store.list_prefix("")]) @@ -31,36 +37,43 @@ async def list_store(store, barrier): current = None await asyncio.sleep(0.1) + async def test_concurrency(): - store = await icechunk.IcechunkStore.from_config( - config={"storage": {"type": "in_memory"}, "repository": {}}, mode="w" + store = icechunk.IcechunkStore.open_or_create( + mode="w", + storage=icechunk.StorageConfig.memory(prefix="concurrency"), ) group = zarr.group(store=store, overwrite=True) - array = group.create_array("array", shape=(N, N), chunk_shape=(1, 1), dtype="f8", fill_value=1e23) + array = group.create_array( + "array", shape=(N, N), chunk_shape=(1, 1), dtype="f8", fill_value=1e23 + ) - barrier = asyncio.Barrier(2*N*N + 1) + barrier = asyncio.Barrier(2 * N * N + 1) async with asyncio.TaskGroup() as tg: _task1 = tg.create_task(list_store(store, barrier), name="listing") for x in range(N): for y in range(N): - _write_task = tg.create_task(read_store(array, x, y, barrier), name=f"read {x},{y}") + _write_task = tg.create_task( + read_store(array, x, y, barrier), name=f"read {x},{y}" + ) for x in range(N): for y in range(N): - _write_task = tg.create_task(write_to_store(array, x, y, barrier), name=f"write {x},{y}") + _write_task = tg.create_task( + write_to_store(array, x, y, barrier), name=f"write {x},{y}" + ) - _res=await store.commit("commit") + _res = store.commit("commit") array = group["array"] assert isinstance(array, zarr.Array) for x in range(N): for y in range(N): - assert array[x,y] == x*y - + assert array[x, y] == x * y # FIXME: add assertions print("done") diff --git a/icechunk-python/tests/test_config.py b/icechunk-python/tests/test_config.py new file mode 100644 index 00000000..5c4bfe93 --- /dev/null +++ b/icechunk-python/tests/test_config.py @@ -0,0 +1,79 @@ +import os + +import icechunk +import pytest +import zarr + + +@pytest.fixture(scope="function") +async def tmp_store(tmpdir): + store_path = f"{tmpdir}" + store = icechunk.IcechunkStore.open_or_create( + storage=icechunk.StorageConfig.filesystem(store_path), + mode="a", + config=icechunk.StoreConfig(inline_chunk_threshold_bytes=5), + ) + + yield store, store_path + + store.close() + + +async def test_no_inline_chunks(tmp_store): + store = tmp_store[0] + store_path = tmp_store[1] + array = zarr.open_array( + store=store, + mode="a", + shape=(10), + dtype="int64", + zarr_format=3, + chunk_shape=(1), + fill_value=-1, + ) + array[:] = 42 + + # Check that the chunks directory was created, since each chunk is 4 bytes and the + # inline_chunk_threshold is 1, we should have 10 chunks in the chunks directory + assert os.path.isdir(f"{store_path}/chunks") + assert len(os.listdir(f"{store_path}/chunks")) == 10 + + +async def test_inline_chunks(tmp_store): + store = tmp_store[0] + store_path = tmp_store[1] + + inline_array = zarr.open_array( + store=store, + mode="a", + path="inline", + shape=(10), + dtype="int32", + zarr_format=3, + chunk_shape=(1), + fill_value=-1, + ) + + inline_array[:] = 9 + + # Check that the chunks directory was not created, since each chunk is 4 bytes and the + # inline_chunk_threshold is 40, we should have no chunks directory + assert not os.path.isdir(f"{store_path}/chunks") + + written_array = zarr.open_array( + store=store, + mode="a", + path="not_inline", + shape=(10), + dtype="int64", + zarr_format=3, + chunk_shape=(1), + fill_value=-1, + ) + + written_array[:] = 3 + + # Check that the chunks directory was not created, since each chunk is 8 
bytes and the + # inline_chunk_threshold is 40, we should have 10 chunks in the chunks directory + assert os.path.isdir(f"{store_path}/chunks") + assert len(os.listdir(f"/{store_path}/chunks")) == 10 diff --git a/icechunk-python/tests/test_distributed_writers.py b/icechunk-python/tests/test_distributed_writers.py new file mode 100644 index 00000000..d38bab35 --- /dev/null +++ b/icechunk-python/tests/test_distributed_writers.py @@ -0,0 +1,157 @@ +import asyncio +import time +from dataclasses import dataclass +from typing import cast + +import icechunk +import numpy as np +import zarr +from dask.distributed import Client + + +@dataclass +class Task: + # fixme: useee StorageConfig and StoreConfig once those are pickable + storage_config: dict + store_config: dict + area: tuple[slice, slice] + seed: int + + +# We create a 2-d array with this many chunks along each direction +CHUNKS_PER_DIM = 10 + +# Each chunk is CHUNK_DIM_SIZE x CHUNK_DIM_SIZE floats +CHUNK_DIM_SIZE = 10 + +# We split the writes in tasks, each task does this many chunks +CHUNKS_PER_TASK = 2 + + +def mk_store(mode: str, task: Task): + storage_config = icechunk.StorageConfig.s3_from_config( + **task.storage_config, + credentials=icechunk.S3Credentials( + access_key_id="minio123", + secret_access_key="minio123", + ), + ) + store_config = icechunk.StoreConfig(**task.store_config) + + store = icechunk.IcechunkStore.open_or_create( + storage=storage_config, + mode="a", + config=store_config, + ) + + return store + + +def generate_task_array(task: Task): + np.random.seed(task.seed) + nx = len(range(*task.area[0].indices(1000))) + ny = len(range(*task.area[1].indices(1000))) + return np.random.rand(nx, ny) + + +async def execute_task(task: Task): + store = mk_store("w", task) + + group = zarr.group(store=store, overwrite=False) + array = cast(zarr.Array, group["array"]) + array[task.area] = generate_task_array(task) + return store.change_set_bytes() + + +def run_task(task: Task): + return asyncio.run(execute_task(task)) + + +async def test_distributed_writers(): + """Write to an array using uncoordinated writers, distributed via Dask. + + We create a big array, and then we split into workers, each worker gets + an area, where it writes random data with a known seed. Each worker + returns the bytes for its ChangeSet, then the coordinator (main thread) + does a distributed commit. When done, we open the store again and verify + we can write everything we have written. 
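+    In outline (a sketch of the flow exercised below, using only helpers defined in
+    this module, not additional test code):
+
+        # each Dask worker
+        store = mk_store("w", task)
+        # ... writes its slice of the array ...
+        return store.change_set_bytes()
+
+        # the coordinator (this test)
+        change_sets_bytes = client.gather(client.map(run_task, tasks))
+        store.distributed_commit("distributed commit", change_sets_bytes)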
+ """ + + client = Client(n_workers=8) + storage_config = { + "bucket": "testbucket", + "prefix": "python-distributed-writers-test__" + str(time.time()), + "endpoint_url": "http://localhost:9000", + "region": "us-east-1", + "allow_http": True, + } + store_config = {"inline_chunk_threshold_bytes": 5} + + ranges = [ + ( + slice( + x, + min( + x + CHUNKS_PER_TASK * CHUNK_DIM_SIZE, + CHUNK_DIM_SIZE * CHUNKS_PER_DIM, + ), + ), + slice( + y, + min( + y + CHUNKS_PER_TASK * CHUNK_DIM_SIZE, + CHUNK_DIM_SIZE * CHUNKS_PER_DIM, + ), + ), + ) + for x in range( + 0, CHUNK_DIM_SIZE * CHUNKS_PER_DIM, CHUNKS_PER_TASK * CHUNK_DIM_SIZE + ) + for y in range( + 0, CHUNK_DIM_SIZE * CHUNKS_PER_DIM, CHUNKS_PER_TASK * CHUNK_DIM_SIZE + ) + ] + tasks = [ + Task( + storage_config=storage_config, + store_config=store_config, + area=area, + seed=idx, + ) + for idx, area in enumerate(ranges) + ] + store = mk_store("r+", tasks[0]) + group = zarr.group(store=store, overwrite=True) + + n = CHUNKS_PER_DIM * CHUNK_DIM_SIZE + array = group.create_array( + "array", + shape=(n, n), + chunk_shape=(CHUNK_DIM_SIZE, CHUNK_DIM_SIZE), + dtype="f8", + fill_value=float("nan"), + ) + _first_snap = store.commit("array created") + + map_result = client.map(run_task, tasks) + change_sets_bytes = client.gather(map_result) + + # we can use the current store as the commit coordinator, because it doesn't have any pending changes, + # all changes come from the tasks, Icechunk doesn't care about where the changes come from, the only + # important thing is to not count changes twice + commit_res = store.distributed_commit("distributed commit", change_sets_bytes) + assert commit_res + + # Lets open a new store to verify the results + store = mk_store("r", tasks[0]) + all_keys = [key async for key in store.list_prefix("/")] + assert ( + len(all_keys) == 1 + 1 + CHUNKS_PER_DIM * CHUNKS_PER_DIM + ) # group meta + array meta + each chunk + + group = zarr.group(store=store, overwrite=False) + + for task in tasks: + actual = array[task.area] + expected = generate_task_array(task) + np.testing.assert_array_equal(actual, expected) diff --git a/icechunk-python/tests/test_pickle.py b/icechunk-python/tests/test_pickle.py new file mode 100644 index 00000000..40cddefc --- /dev/null +++ b/icechunk-python/tests/test_pickle.py @@ -0,0 +1,63 @@ +import pickle + +import icechunk +import pytest +import zarr +from zarr.storage import LocalStore + + +@pytest.fixture(scope="function") +async def tmp_store(tmpdir): + store_path = f"{tmpdir}" + store = icechunk.IcechunkStore.open_or_create( + storage=icechunk.StorageConfig.filesystem(store_path), + mode="w", + ) + + yield store + + store.close() + + +async def test_pickle(tmp_store): + root = zarr.group(store=tmp_store) + array = root.ones(name="ones", shape=(10, 10), chunks=(5, 5), dtype="float32") + array[:] = 20 + tmp_store.commit("firsttt") + + pickled = pickle.dumps(tmp_store) + + store_loaded = pickle.loads(pickled) + assert store_loaded == tmp_store + + root_loaded = zarr.open_group(store_loaded) + array_loaded = root_loaded["ones"] + + assert type(array_loaded) is zarr.Array + assert array_loaded == array + assert array_loaded[0, 5] == 20 + + +async def test_store_equality(tmpdir, tmp_store): + assert tmp_store == tmp_store + + local_store = await LocalStore.open(f"{tmpdir}/zarr", mode="w") + assert tmp_store != local_store + + store2 = icechunk.IcechunkStore.open_or_create( + storage=icechunk.StorageConfig.memory(prefix="test"), + mode="w", + ) + assert tmp_store != store2 + + store3 = 
icechunk.IcechunkStore.open_or_create( + storage=icechunk.StorageConfig.filesystem(f"{tmpdir}/test"), + mode="a", + ) + assert tmp_store != store3 + + store4 = icechunk.IcechunkStore.open_or_create( + storage=icechunk.StorageConfig.filesystem(f"{tmpdir}/test"), + mode="a", + ) + assert store3 == store4 diff --git a/icechunk-python/tests/test_timetravel.py b/icechunk-python/tests/test_timetravel.py index fc43b670..6f149139 100644 --- a/icechunk-python/tests/test_timetravel.py +++ b/icechunk-python/tests/test_timetravel.py @@ -1,12 +1,11 @@ -import zarr - import icechunk +import zarr -async def test_timetravel(): - store = await icechunk.IcechunkStore.create( - storage=icechunk.Storage.memory("test"), - config=icechunk.StoreConfig(inline_chunk_threshold=1), +def test_timetravel(): + store = icechunk.IcechunkStore.create( + storage=icechunk.StorageConfig.memory("test"), + config=icechunk.StoreConfig(inline_chunk_threshold_bytes=1), ) group = zarr.group(store=store, overwrite=True) @@ -17,40 +16,40 @@ async def test_timetravel(): air_temp[:, :] = 42 assert air_temp[200, 6] == 42 - snapshot_id = await store.commit("commit 1") + snapshot_id = store.commit("commit 1") air_temp[:, :] = 54 assert air_temp[200, 6] == 54 - new_snapshot_id = await store.commit("commit 2") + new_snapshot_id = store.commit("commit 2") - await store.checkout(snapshot_id=snapshot_id) + store.checkout(snapshot_id=snapshot_id) assert air_temp[200, 6] == 42 - await store.checkout(snapshot_id=new_snapshot_id) + store.checkout(snapshot_id=new_snapshot_id) assert air_temp[200, 6] == 54 - await store.checkout(branch="main") + store.checkout(branch="main") air_temp[:, :] = 76 assert store.has_uncommitted_changes assert store.branch == "main" assert store.snapshot_id == new_snapshot_id - await store.reset() + store.reset() assert not store.has_uncommitted_changes assert air_temp[200, 6] == 54 - await store.new_branch("feature") + store.new_branch("feature") assert store.branch == "feature" air_temp[:, :] = 90 - feature_snapshot_id = await store.commit("commit 3") - await store.tag("v1.0", feature_snapshot_id) + feature_snapshot_id = store.commit("commit 3") + store.tag("v1.0", feature_snapshot_id) - await store.checkout(tag="v1.0") + store.checkout(tag="v1.0") assert store.branch is None assert air_temp[200, 6] == 90 - parents = [p async for p in store.ancestry()] + parents = [p for p in store.ancestry()] assert [snap.message for snap in parents] == [ "commit 3", "commit 2", @@ -59,3 +58,27 @@ async def test_timetravel(): ] assert sorted(parents, key=lambda p: p.written_at) == list(reversed(parents)) assert len(set([snap.id for snap in parents])) == 4 + + +async def test_branch_reset(): + store = icechunk.IcechunkStore.create( + storage=icechunk.StorageConfig.memory("test"), + config=icechunk.StoreConfig(inline_chunk_threshold_bytes=1), + ) + + group = zarr.group(store=store, overwrite=True) + group.create_group("a") + prev_snapshot_id = store.commit("group a") + group.create_group("b") + store.commit("group b") + + keys = {k async for k in store.list()} + assert "a/zarr.json" in keys + assert "b/zarr.json" in keys + + store.reset_branch(prev_snapshot_id) + + keys = {k async for k in store.list()} + assert "a/zarr.json" in keys + assert "b/zarr.json" not in keys + diff --git a/icechunk-python/tests/test_virtual_ref.py b/icechunk-python/tests/test_virtual_ref.py index 88819718..94dee971 100644 --- a/icechunk-python/tests/test_virtual_ref.py +++ b/icechunk-python/tests/test_virtual_ref.py @@ -1,8 +1,16 @@ -from object_store import 
ClientOptions, ObjectStore -from icechunk import IcechunkStore, Storage, S3Credentials +import numpy as np +import pytest import zarr import zarr.core import zarr.core.buffer +from icechunk import ( + IcechunkStore, + S3Credentials, + StorageConfig, + StoreConfig, + VirtualRefConfig, +) +from object_store import ClientOptions, ObjectStore def write_chunks_to_minio(chunks: list[tuple[str, bytes]]): @@ -24,7 +32,7 @@ def write_chunks_to_minio(chunks: list[tuple[str, bytes]]): store.put(key, data) -async def test_write_virtual_refs(): +async def test_write_minio_virtual_refs(): write_chunks_to_minio( [ ("path/to/python/chunk-1", b"first"), @@ -32,28 +40,35 @@ async def test_write_virtual_refs(): ] ) - # Open the store, the S3 credentials must be set in environment vars for this to work for now - store = await IcechunkStore.open( - storage=Storage.s3_from_credentials( - bucket="testbucket", - prefix="python-virtual-ref", - credentials=S3Credentials( - access_key_id="minio123", - secret_access_key="minio123", - ), - endpoint_url="http://localhost:9000", + # Open the store + store = IcechunkStore.open_or_create( + storage=StorageConfig.memory("virtual"), + mode="w", + config=StoreConfig( + virtual_ref_config=VirtualRefConfig.s3_from_config( + credentials=S3Credentials( + access_key_id="minio123", + secret_access_key="minio123", + ), + endpoint_url="http://localhost:9000", + allow_http=True, + region="us-east-1", + ) ), - mode="r+", ) - array = zarr.Array.create(store, shape=(1, 1, 2), chunk_shape=(1, 1, 1), dtype="i4") + array = zarr.Array.create(store, shape=(1, 1, 3), chunk_shape=(1, 1, 1), dtype="i4") - await store.set_virtual_ref( + store.set_virtual_ref( "c/0/0/0", "s3://testbucket/path/to/python/chunk-1", offset=0, length=4 ) - await store.set_virtual_ref( + store.set_virtual_ref( "c/0/0/1", "s3://testbucket/path/to/python/chunk-2", offset=1, length=4 ) + # we write a ref that simulates a lost chunk + await store.async_set_virtual_ref( + "c/0/0/2", "s3://testbucket/path/to/python/non-existing", offset=1, length=4 + ) buffer_prototype = zarr.core.buffer.default_buffer_prototype() @@ -67,3 +82,46 @@ async def test_write_virtual_refs(): assert array[0, 0, 0] == 1936877926 assert array[0, 0, 1] == 1852793701 + + # fetch uninitialized chunk should be None + assert await store.get("c/0/0/3", prototype=buffer_prototype) is None + + # fetching a virtual ref that disappeared should be an exception + with pytest.raises(ValueError): + # TODO: we should include the key and other info in the exception + await store.get("c/0/0/2", prototype=buffer_prototype) + + _snapshot_id = store.commit("Add virtual refs") + + +async def test_from_s3_public_virtual_refs(tmpdir): + # Open the store, + store = IcechunkStore.open_or_create( + storage=StorageConfig.filesystem(f'{tmpdir}/virtual'), + mode="w", + config=StoreConfig( + virtual_ref_config=VirtualRefConfig.s3_anonymous(region="us-east-1", allow_http=False) + ), + ) + root = zarr.Group.from_store(store=store, zarr_format=3) + depth = root.require_array( + name="depth", shape=((10, )), chunk_shape=((10,)), dtype="float64" + ) + + store.set_virtual_ref( + "depth/c/0", + "s3://noaa-nos-ofs-pds/dbofs/netcdf/202410/dbofs.t00z.20241009.fields.f030.nc", + offset=119339, + length=80 + ) + + nodes = [n async for n in store.list()] + assert "depth/c/0" in nodes + + depth_values = depth[:] + assert len(depth_values) == 10 + actual_values = np.array([-0.95,-0.85,-0.75,-0.65,-0.55,-0.45,-0.35,-0.25,-0.15,-0.05]) + assert np.allclose(depth_values, actual_values) + + + diff 
--git a/icechunk-python/tests/test_zarr/test_api.py b/icechunk-python/tests/test_zarr/test_api.py index 2bdd193c..5baf8fd7 100644 --- a/icechunk-python/tests/test_zarr/test_api.py +++ b/icechunk-python/tests/test_zarr/test_api.py @@ -1,20 +1,30 @@ import pathlib +from typing import Literal -from icechunk import IcechunkStore import numpy as np import pytest -from numpy.testing import assert_array_equal - import zarr -from zarr import Array, Group +from icechunk import IcechunkStore +from numpy.testing import assert_array_equal +from zarr import Array, Group, group from zarr.abc.store import Store -from zarr.api.synchronous import create, load, open, open_group, save, save_array, save_group +from zarr.api.synchronous import ( + create, + load, + open, + open_group, + save, + save_array, + save_group, +) +from zarr.storage._utils import normalize_path from ..conftest import parse_store + @pytest.fixture(scope="function") async def memory_store() -> IcechunkStore: - return await parse_store("memory", "") + return parse_store("memory", "") def test_create_array(memory_store: Store) -> None: @@ -38,6 +48,21 @@ def test_create_array(memory_store: Store) -> None: assert z.chunks == (40,) +@pytest.mark.parametrize("path", ["foo", "/", "/foo", "///foo/bar"]) +@pytest.mark.parametrize("node_type", ["array", "group"]) +def test_open_normalized_path( + memory_store: IcechunkStore, path: str, node_type: Literal["array", "group"] +) -> None: + node: Group | Array + if node_type == "group": + node = group(store=memory_store, path=path) + elif node_type == "array": + node = create(store=memory_store, path=path, shape=(2,)) + + assert node.path == normalize_path(path) + + + async def test_open_array(memory_store: IcechunkStore) -> None: store = memory_store @@ -47,8 +72,8 @@ async def test_open_array(memory_store: IcechunkStore) -> None: assert z.shape == (100,) # open array, overwrite - # _store_dict wont currently work with IcechunkStore - # TODO: Should it? + # _store_dict won't currently work with IcechunkStore + # TODO: Should it? pytest.xfail("IcechunkStore does not support _store_dict") store._store_dict = {} z = open(store=store, shape=200, mode="w") # mode="w" @@ -58,10 +83,10 @@ async def test_open_array(memory_store: IcechunkStore) -> None: # open array, read-only store_cls = type(store) - # _store_dict wont currently work with IcechunkStore - # TODO: Should it? + # _store_dict won't currently work with IcechunkStore + # TODO: Should it? - ro_store = await store_cls.open(store_dict=store._store_dict, mode="r") + ro_store = store_cls.open(store_dict=store._store_dict, mode="r") z = open(store=ro_store) assert isinstance(z, Array) assert z.shape == (200,) @@ -88,10 +113,10 @@ async def test_open_group(memory_store: IcechunkStore) -> None: # open group, read-only store_cls = type(store) - # _store_dict wont currently work with IcechunkStore - # TODO: Should it? + # _store_dict won't currently work with IcechunkStore + # TODO: Should it? 
pytest.xfail("IcechunkStore does not support _store_dict") - ro_store = await store_cls.open(store_dict=store._store_dict, mode="r") + ro_store = store_cls.open(store_dict=store._store_dict, mode="r") g = open_group(store=ro_store) assert isinstance(g, Group) # assert g.read_only @@ -125,14 +150,21 @@ def test_open_with_mode_r_plus(tmp_path: pathlib.Path) -> None: # 'r+' means read/write (must exist) with pytest.raises(FileNotFoundError): zarr.open(store=tmp_path, mode="r+") - zarr.ones(store=tmp_path, shape=(3, 3)) + z1 = zarr.ones(store=tmp_path, shape=(3, 3)) + assert z1.fill_value == 1 z2 = zarr.open(store=tmp_path, mode="r+") assert isinstance(z2, Array) + assert z2.fill_value == 1 assert (z2[:] == 1).all() z2[:] = 3 -def test_open_with_mode_a(tmp_path: pathlib.Path) -> None: +async def test_open_with_mode_a(tmp_path: pathlib.Path) -> None: + # Open without shape argument should default to group + g = zarr.open(store=tmp_path, mode="a") + assert isinstance(g, Group) + await g.store_path.delete() + # 'a' means read/write (create if doesn't exist) arr = zarr.open(store=tmp_path, mode="a", shape=(3, 3)) assert isinstance(arr, Array) diff --git a/icechunk-python/tests/test_zarr/test_array.py b/icechunk-python/tests/test_zarr/test_array.py index 3cf10c72..704fcf8e 100644 --- a/icechunk-python/tests/test_zarr/test_array.py +++ b/icechunk-python/tests/test_zarr/test_array.py @@ -1,13 +1,21 @@ -from typing import Literal +import pickle +from itertools import accumulate +from typing import Any, Literal -from icechunk import IcechunkStore import numpy as np import pytest - -from zarr import Array, Group -from zarr.core.common import ZarrFormat +import zarr +import zarr.api +import zarr.api.asynchronous +from icechunk import IcechunkStore +from zarr import Array, AsyncArray, AsyncGroup, Group +from zarr.codecs import BytesCodec, VLenBytesCodec +from zarr.core.array import chunks_initialized +from zarr.core.common import JSON, ZarrFormat +from zarr.core.indexing import ceildiv +from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError -from zarr.store.common import StorePath +from zarr.storage import StorePath @pytest.mark.parametrize("store", ["memory"], indirect=["store"]) @@ -61,12 +69,54 @@ def test_array_creation_existing_node( ) +@pytest.mark.parametrize("store", ["memory"], indirect=["store"]) +@pytest.mark.parametrize("zarr_format", [3]) +async def test_create_creates_parents( + store: IcechunkStore, zarr_format: ZarrFormat +) -> None: + # prepare a root node, with some data set + await zarr.api.asynchronous.open_group( + store=store, path="a", zarr_format=zarr_format, attributes={"key": "value"} + ) + + # create a child node with a couple intermediates + await zarr.api.asynchronous.create( + shape=(2, 2), store=store, path="a/b/c/d", zarr_format=zarr_format + ) + parts = ["a", "a/b", "a/b/c"] + + if zarr_format == 2: + files = [".zattrs", ".zgroup"] + else: + files = ["zarr.json"] + + expected = [f"{part}/{file}" for file in files for part in parts] + + if zarr_format == 2: + expected.extend([".zattrs", ".zgroup", "a/b/c/d/.zarray", "a/b/c/d/.zattrs"]) + else: + expected.extend(["zarr.json", "a/b/c/d/zarr.json"]) + + expected = sorted(expected) + + result = sorted([x async for x in store.list_prefix("")]) + + assert result == expected + + paths = ["a", "a/b", "a/b/c"] + for path in paths: + g = await zarr.api.asynchronous.open_group(store=store, path=path) + assert isinstance(g, AsyncGroup) + + @pytest.mark.parametrize("store", ["memory"], 
indirect=["store"]) @pytest.mark.parametrize("zarr_format", [3]) def test_array_name_properties_no_group( store: IcechunkStore, zarr_format: ZarrFormat ) -> None: - arr = Array.create(store=store, shape=(100,), chunks=(10,), zarr_format=zarr_format, dtype="i4") + arr = Array.create( + store=store, shape=(100,), chunks=(10,), zarr_format=zarr_format, dtype="i4" + ) assert arr.path == "" assert arr.name is None assert arr.basename is None @@ -122,9 +172,13 @@ def test_array_v3_fill_value_default( @pytest.mark.parametrize("store", ["memory"], indirect=True) -@pytest.mark.parametrize("fill_value", [False, 0.0, 1, 2.3]) -@pytest.mark.parametrize("dtype_str", ["bool", "uint8", "float32", "complex64"]) -def test_array_v3_fill_value(store: IcechunkStore, fill_value: int, dtype_str: str) -> None: +@pytest.mark.parametrize( + ("dtype_str", "fill_value"), + [("bool", True), ("uint8", 99), ("float32", -99.9), ("complex64", 3 + 4j)], +) +def test_array_v3_fill_value( + store: IcechunkStore, fill_value: int, dtype_str: str +) -> None: shape = (10,) arr = Array.create( store=store, @@ -137,3 +191,197 @@ def test_array_v3_fill_value(store: IcechunkStore, fill_value: int, dtype_str: s assert arr.fill_value == np.dtype(dtype_str).type(fill_value) assert arr.fill_value.dtype == arr.dtype + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +async def test_array_v3_nan_fill_value(store: IcechunkStore) -> None: + shape = (10,) + arr = Array.create( + store=store, + shape=shape, + dtype=np.float64, + zarr_format=3, + chunk_shape=shape, + fill_value=np.nan, + ) + arr[:] = np.nan + + assert np.isnan(arr.fill_value) + assert arr.fill_value.dtype == arr.dtype + # # all fill value chunk is an empty chunk, and should not be written + # assert len([a async for a in store.list_prefix("/")]) == 0 + + +@pytest.mark.parametrize("store", ["local"], indirect=["store"]) +@pytest.mark.parametrize("zarr_format", [3]) +async def test_serializable_async_array( + store: IcechunkStore, zarr_format: ZarrFormat +) -> None: + expected = await AsyncArray.create( + store=store, shape=(100,), chunks=(10,), zarr_format=zarr_format, dtype="i4" + ) + # await expected.setitems(list(range(100))) + + p = pickle.dumps(expected) + actual = pickle.loads(p) + + assert actual == expected + # np.testing.assert_array_equal(await actual.getitem(slice(None)), await expected.getitem(slice(None))) + # TODO: uncomment the parts of this test that will be impacted by the config/prototype changes in flight + + +@pytest.mark.parametrize("store", ["local"], indirect=["store"]) +@pytest.mark.parametrize("zarr_format", [3]) +def test_serializable_sync_array(store: IcechunkStore, zarr_format: ZarrFormat) -> None: + expected = Array.create( + store=store, shape=(100,), chunks=(10,), zarr_format=zarr_format, dtype="i4" + ) + expected[:] = list(range(100)) + + p = pickle.dumps(expected) + actual = pickle.loads(p) + + assert actual == expected + np.testing.assert_array_equal(actual[:], expected[:]) + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +def test_storage_transformers(store: IcechunkStore) -> None: + """ + Test that providing an actual storage transformer produces a warning and otherwise passes through + """ + metadata_dict: dict[str, JSON] = { + "zarr_format": 3, + "node_type": "array", + "shape": (10,), + "chunk_grid": {"name": "regular", "configuration": {"chunk_shape": (1,)}}, + "data_type": "uint8", + "chunk_key_encoding": {"name": "v2", "configuration": {"separator": "/"}}, + "codecs": (BytesCodec().to_dict(),), + 
"fill_value": 0, + "storage_transformers": ({"test": "should_raise"}), + } + match = "Arrays with storage transformers are not supported in zarr-python at this time." + with pytest.raises(ValueError, match=match): + Array.from_dict(StorePath(store), data=metadata_dict) + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("test_cls", [Array, AsyncArray[Any]]) +@pytest.mark.parametrize("nchunks", [2, 5, 10]) +def test_nchunks(store: IcechunkStore, test_cls: type[Array] | type[AsyncArray[Any]], nchunks: int) -> None: + """ + Test that nchunks returns the number of chunks defined for the array. + """ + shape = 100 + arr = Array.create(store, shape=(shape,), chunks=(ceildiv(shape, nchunks),), dtype="i4") + expected = nchunks + if test_cls == Array: + observed = arr.nchunks + else: + observed = arr._async_array.nchunks + assert observed == expected + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("test_cls", [Array, AsyncArray[Any]]) +def test_nchunks_initialized(store: IcechunkStore, test_cls: type[Array] | type[AsyncArray[Any]]) -> None: + """ + Test that nchunks_initialized accurately returns the number of stored chunks. + """ + arr = Array.create(store, shape=(100,), chunks=(10,), dtype="i4") + + # write chunks one at a time + for idx, region in enumerate(arr._iter_chunk_regions()): + arr[region] = 1 + expected = idx + 1 + if test_cls == Array: + observed = arr.nchunks_initialized + else: + observed = arr._async_array.nchunks_initialized + assert observed == expected + + # delete chunks + for idx, key in enumerate(arr._iter_chunk_keys()): + sync(arr.store_path.store.delete(key)) + if test_cls == Array: + observed = arr.nchunks_initialized + else: + observed = arr._async_array.nchunks_initialized + expected = arr.nchunks - idx - 1 + assert observed == expected + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +@pytest.mark.parametrize("test_cls", [Array, AsyncArray[Any]]) +def test_chunks_initialized(store: IcechunkStore, test_cls: type[Array] | type[AsyncArray[Any]]) -> None: + """ + Test that chunks_initialized accurately returns the keys of stored chunks. 
+ """ + arr = Array.create(store, shape=(100,), chunks=(10,), dtype="i4") + + chunks_accumulated = tuple( + accumulate(tuple(tuple(v.split(" ")) for v in arr._iter_chunk_keys())) + ) + for keys, region in zip(chunks_accumulated, arr._iter_chunk_regions(), strict=False): + arr[region] = 1 + + if test_cls == Array: + observed = sorted(chunks_initialized(arr)) + else: + observed = sorted(chunks_initialized(arr._async_array)) + + expected = sorted(keys) + assert observed == expected + + +@pytest.mark.parametrize("store", ["memory"], indirect=True) +def test_default_fill_values(store: IcechunkStore) -> None: + root = Group.from_store(store) + + a = root.create(name="u4", shape=5, chunk_shape=5, dtype=" None: + with pytest.raises(ValueError, match="At least one ArrayBytesCodec is required."): + Array.create(store, shape=5, chunk_shape=5, dtype=" None: + # regression test for https://github.com/zarr-developers/zarr-python/issues/2328 + arr = Array.create(store=store, shape=5, chunk_shape=5, dtype="f8", zarr_format=zarr_format) + arr.attrs["foo"] = "bar" + assert arr.attrs["foo"] == "bar" + + arr2 = zarr.open_array(store=store, zarr_format=zarr_format) + assert arr2.attrs["foo"] == "bar" \ No newline at end of file diff --git a/icechunk-python/tests/test_zarr/test_group.py b/icechunk-python/tests/test_zarr/test_group.py index 8e0e545e..6ea932a3 100644 --- a/icechunk-python/tests/test_zarr/test_group.py +++ b/icechunk-python/tests/test_zarr/test_group.py @@ -2,18 +2,19 @@ from typing import TYPE_CHECKING, Any, Literal, cast -from icechunk import IcechunkStore import numpy as np import pytest - +import zarr +import zarr.api +import zarr.api.asynchronous +from icechunk import IcechunkStore from zarr import Array, AsyncArray, AsyncGroup, Group from zarr.core.buffer import default_buffer_prototype -from zarr.core.common import ZarrFormat +from zarr.core.common import JSON, ZarrFormat from zarr.core.group import GroupMetadata from zarr.core.sync import sync from zarr.errors import ContainsArrayError, ContainsGroupError -from zarr.store import StorePath -from zarr.store.common import make_store_path +from zarr.storage import StorePath, make_store_path from ..conftest import parse_store @@ -22,10 +23,12 @@ @pytest.fixture(params=["memory"]) -async def store(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> IcechunkStore: - result = await parse_store(request.param, str(tmpdir)) +def store(request: pytest.FixtureRequest, tmpdir: LEGACY_PATH) -> IcechunkStore: + result = parse_store(request.param, str(tmpdir)) if not isinstance(result, IcechunkStore): - raise TypeError("Wrong store class returned by test fixture! got " + result + " instead") + raise TypeError( + "Wrong store class returned by test fixture! 
got " + result + " instead" + ) return result @@ -54,6 +57,63 @@ def test_group_init(store: IcechunkStore, zarr_format: ZarrFormat) -> None: assert group._async_group == agroup +async def test_create_creates_parents( + store: IcechunkStore, zarr_format: ZarrFormat +) -> None: + # prepare a root node, with some data set + await zarr.api.asynchronous.open_group( + store=store, path="a", zarr_format=zarr_format, attributes={"key": "value"} + ) + objs = {x async for x in store.list()} + if zarr_format == 2: + assert objs == {".zgroup", ".zattrs", "a/.zgroup", "a/.zattrs"} + else: + assert objs == {"zarr.json", "a/zarr.json"} + + # test that root group node was created + root = await zarr.api.asynchronous.open_group( + store=store, + ) + + agroup = await root.getitem("a") + assert agroup.attrs == {"key": "value"} + + # create a child node with a couple intermediates + await zarr.api.asynchronous.open_group( + store=store, path="a/b/c/d", zarr_format=zarr_format + ) + parts = ["a", "a/b", "a/b/c"] + + if zarr_format == 2: + files = [".zattrs", ".zgroup"] + else: + files = ["zarr.json"] + + expected = [f"{part}/{file}" for file in files for part in parts] + + if zarr_format == 2: + expected.extend([".zgroup", ".zattrs", "a/b/c/d/.zgroup", "a/b/c/d/.zattrs"]) + else: + expected.extend(["zarr.json", "a/b/c/d/zarr.json"]) + + expected = sorted(expected) + + result = sorted([x async for x in store.list_prefix("")]) + + assert result == expected + + paths = ["a", "a/b", "a/b/c"] + for path in paths: + g = await zarr.api.asynchronous.open_group(store=store, path=path) + assert isinstance(g, AsyncGroup) + + if path == "a": + # ensure we didn't overwrite the root attributes + assert g.attrs == {"key": "value"} + else: + assert g.attrs == {} + + def test_group_name_properties(store: IcechunkStore, zarr_format: ZarrFormat) -> None: """ Test basic properties of groups @@ -81,18 +141,14 @@ def test_group_members(store: IcechunkStore, zarr_format: ZarrFormat) -> None: """ path = "group" - agroup = AsyncGroup( - metadata=GroupMetadata(zarr_format=zarr_format), - store_path=StorePath(store=store, path=path), - ) - group = Group(agroup) + group = Group.from_store(store=store, zarr_format=zarr_format) members_expected: dict[str, Array | Group] = {} members_expected["subgroup"] = group.create_group("subgroup") # make a sub-sub-subgroup, to ensure that the children calculation doesn't go # too deep in the hierarchy - subsubgroup = members_expected["subgroup"].create_group("subsubgroup") # type: ignore - subsubsubgroup = subsubgroup.create_group("subsubsubgroup") # type: ignore + subsubgroup = cast(Group, members_expected["subgroup"]).create_group("subsubgroup") + subsubsubgroup = subsubgroup.create_group("subsubsubgroup") members_expected["subarray"] = group.create_array( "subarray", shape=(100,), dtype="uint8", chunk_shape=(10,), exists_ok=True @@ -103,9 +159,12 @@ def test_group_members(store: IcechunkStore, zarr_format: ZarrFormat) -> None: # the list of children should ignore this object. with pytest.raises(ValueError): sync( - store.set(f"{path}/extra_object-1", default_buffer_prototype().buffer.from_bytes(b"000000")) + store.set( + f"{path}/extra_object-1", + default_buffer_prototype().buffer.from_bytes(b"000000"), + ) ) - + # This is not supported by Icechunk, so we expect an error # zarr-python: add an extra object under a directory-like prefix in the domain of the group. 
# this creates a directory with a random key in it @@ -142,7 +201,9 @@ def test_group(store: IcechunkStore, zarr_format: ZarrFormat) -> None: Test basic Group routines. """ store_path = StorePath(store) - agroup = AsyncGroup(metadata=GroupMetadata(zarr_format=zarr_format), store_path=store_path) + agroup = AsyncGroup( + metadata=GroupMetadata(zarr_format=zarr_format), store_path=store_path + ) group = Group(agroup) assert agroup.metadata is group.metadata assert agroup.store_path == group.store_path == store_path @@ -186,15 +247,15 @@ def test_group_create( Test that `Group.create` works as expected. """ attributes = {"foo": 100} - group = Group.from_store(store, attributes=attributes, zarr_format=zarr_format, exists_ok=exists_ok) + group = Group.from_store( + store, attributes=attributes, zarr_format=zarr_format, exists_ok=exists_ok + ) assert group.attrs == attributes if not exists_ok: with pytest.raises(ContainsGroupError): - group = Group.from_store( - store, attributes=attributes, exists_ok=exists_ok, zarr_format=zarr_format - ) + _ = Group.from_store(store, exists_ok=exists_ok, zarr_format=zarr_format) def test_group_open( @@ -221,7 +282,9 @@ def test_group_open( new_attrs = {"path": "bar"} if not exists_ok: with pytest.raises(ContainsGroupError): - Group.from_store(store, attributes=attrs, zarr_format=zarr_format, exists_ok=exists_ok) + Group.from_store( + store, attributes=attrs, zarr_format=zarr_format, exists_ok=exists_ok + ) else: group_created_again = Group.from_store( store, attributes=new_attrs, zarr_format=zarr_format, exists_ok=exists_ok @@ -246,6 +309,26 @@ def test_group_getitem(store: IcechunkStore, zarr_format: ZarrFormat) -> None: group["nope"] +def test_group_get_with_default(store: IcechunkStore, zarr_format: ZarrFormat) -> None: + group = Group.from_store(store, zarr_format=zarr_format) + + # default behavior + result = group.get("subgroup") + assert result is None + + # custom default + result = group.get("subgroup", 8) + assert result == 8 + + # now with a group + subgroup = group.require_group("subgroup") + subgroup.attrs["foo"] = "bar" + + result = group.get("subgroup", 8) + result = cast(Group, result) + assert result.attrs["foo"] == "bar" + + def test_group_delitem(store: IcechunkStore, zarr_format: ZarrFormat) -> None: """ Test the `Group.__delitem__` method. 
@@ -273,8 +356,7 @@ def test_group_iter(store: IcechunkStore, zarr_format: ZarrFormat) -> None: """ group = Group.from_store(store, zarr_format=zarr_format) - with pytest.raises(NotImplementedError): - [x for x in group] # type: ignore + assert list(group) == [] def test_group_len(store: IcechunkStore, zarr_format: ZarrFormat) -> None: @@ -283,8 +365,7 @@ def test_group_len(store: IcechunkStore, zarr_format: ZarrFormat) -> None: """ group = Group.from_store(store, zarr_format=zarr_format) - with pytest.raises(NotImplementedError): - len(group) # type: ignore + assert len(group) == 0 def test_group_setitem(store: IcechunkStore, zarr_format: ZarrFormat) -> None: @@ -412,12 +493,14 @@ def test_group_creation_existing_node( """ spath = StorePath(store) group = Group.from_store(spath, zarr_format=zarr_format) - expected_exception: type[ContainsArrayError] | type[ContainsGroupError] - attributes = {"old": True} + expected_exception: type[ContainsArrayError | ContainsGroupError] + attributes: dict[str, JSON] = {"old": True} if extant_node == "array": expected_exception = ContainsArrayError - _ = group.create_array("extant", shape=(10,), dtype="uint8", attributes=attributes) + _ = group.create_array( + "extant", shape=(10,), dtype="uint8", attributes=attributes + ) elif extant_node == "group": expected_exception = ContainsGroupError _ = group.create_group("extant", attributes=attributes) @@ -462,7 +545,9 @@ async def test_asyncgroup_create( zarr_format=zarr_format, ) - assert agroup.metadata == GroupMetadata(zarr_format=zarr_format, attributes=attributes) + assert agroup.metadata == GroupMetadata( + zarr_format=zarr_format, attributes=attributes + ) assert agroup.store_path == await make_store_path(store) if not exists_ok: @@ -489,7 +574,9 @@ async def test_asyncgroup_create( async def test_asyncgroup_attrs(store: IcechunkStore, zarr_format: ZarrFormat) -> None: attributes = {"foo": 100} - agroup = await AsyncGroup.from_store(store, zarr_format=zarr_format, attributes=attributes) + agroup = await AsyncGroup.from_store( + store, zarr_format=zarr_format, attributes=attributes + ) assert agroup.attrs == agroup.metadata.attributes == attributes @@ -546,9 +633,9 @@ async def test_asyncgroup_open_wrong_format( # should this be async? @pytest.mark.parametrize( "data", - ( + [ {"zarr_format": 3, "node_type": "group", "attributes": {"foo": 100}}, - ), + ], ) def test_asyncgroup_from_dict(store: IcechunkStore, data: dict[str, Any]) -> None: """ @@ -591,7 +678,11 @@ async def test_asyncgroup_delitem(store: IcechunkStore, zarr_format: ZarrFormat) agroup = await AsyncGroup.from_store(store=store, zarr_format=zarr_format) array_name = "sub_array" _ = await agroup.create_array( - name=array_name, shape=(10,), dtype="uint8", chunk_shape=(2,), attributes={"foo": 100} + name=array_name, + shape=(10,), + dtype="uint8", + chunk_shape=(2,), + attributes={"foo": 100}, ) await agroup.delitem(array_name) @@ -650,7 +741,7 @@ async def test_asyncgroup_create_array( shape = (10,) dtype = "uint8" chunk_shape = (4,) - attributes = {"foo": 100} + attributes: dict[str, JSON] = {"foo": 100} sub_node_path = "sub_array" subnode = await agroup.create_array( @@ -669,7 +760,7 @@ async def test_asyncgroup_create_array( assert subnode.dtype == dtype # todo: fix the type annotation of array.metadata.chunk_grid so that we get some autocomplete # here. 
- assert subnode.metadata.chunk_grid.chunk_shape == chunk_shape + assert subnode.metadata.chunk_grid.chunk_shape == chunk_shape # type: ignore assert subnode.metadata.zarr_format == zarr_format @@ -724,7 +815,9 @@ async def test_group_members_async(store: IcechunkStore) -> None: assert nmembers == 4 # all children - all_children = sorted([x async for x in group.members(max_depth=None)], key=lambda x: x[0]) + all_children = sorted( + [x async for x in group.members(max_depth=None)], key=lambda x: x[0] + ) expected = [ ("a0", a0), ("g0", g0), @@ -807,7 +900,9 @@ async def test_create_dataset(store: IcechunkStore, zarr_format: ZarrFormat) -> async def test_require_array(store: IcechunkStore, zarr_format: ZarrFormat) -> None: root = await AsyncGroup.from_store(store=store, zarr_format=zarr_format) - foo1 = await root.require_array("foo", shape=(10,), dtype="i8", attributes={"foo": 101}) + foo1 = await root.require_array( + "foo", shape=(10,), dtype="i8", attributes={"foo": 101} + ) assert foo1.attrs == {"foo": 101} foo2 = await root.require_array("foo", shape=(10,), dtype="i8") assert foo2.attrs == {"foo": 101} @@ -828,3 +923,15 @@ async def test_require_array(store: IcechunkStore, zarr_format: ZarrFormat) -> N _ = await root.create_group("bar") with pytest.raises(TypeError, match="Incompatible object"): await root.require_array("bar", shape=(10,), dtype="int8") + + +class TestGroupMetadata: + def test_from_dict_extra_fields(self): + data = { + "attributes": {"key": "value"}, + "_nczarr_superblock": {"version": "2.0.0"}, + "zarr_format": 2, + } + result = GroupMetadata.from_dict(data) + expected = GroupMetadata(attributes={"key": "value"}, zarr_format=2) + assert result == expected diff --git a/icechunk-python/tests/test_zarr/test_store/test_core.py b/icechunk-python/tests/test_zarr/test_store/test_core.py index 525bfc55..dd8b0934 100644 --- a/icechunk-python/tests/test_zarr/test_store/test_core.py +++ b/icechunk-python/tests/test_zarr/test_store/test_core.py @@ -1,12 +1,11 @@ from icechunk import IcechunkStore - -from zarr.store.common import make_store_path +from zarr.storage import make_store_path from ...conftest import parse_store async def test_make_store_path() -> None: # Memory store - store = await parse_store("memory", path="") + store = parse_store("memory", path="") store_path = await make_store_path(store) assert isinstance(store_path.store, IcechunkStore) diff --git a/icechunk-python/tests/test_zarr/test_store/test_icechunk_store.py b/icechunk-python/tests/test_zarr/test_store/test_icechunk_store.py index 5a2ed85d..1540a1be 100644 --- a/icechunk-python/tests/test_zarr/test_store/test_icechunk_store.py +++ b/icechunk-python/tests/test_zarr/test_store/test_icechunk_store.py @@ -1,15 +1,16 @@ from __future__ import annotations -from typing import Any -import pytest +from typing import Any, cast +import pytest +from icechunk import IcechunkStore, StorageConfig +from zarr.abc.store import AccessMode from zarr.core.buffer import Buffer, cpu, default_buffer_prototype +from zarr.core.common import AccessModeLiteral +from zarr.core.sync import collect_aiterator from zarr.testing.store import StoreTests -from icechunk import IcechunkStore, Storage - - -DEFAULT_GROUP_METADATA = b'{"zarr_format":3,"node_type":"group","attributes":null}' +DEFAULT_GROUP_METADATA = b'{"zarr_format":3,"node_type":"group"}' ARRAY_METADATA = ( b'{"zarr_format":3,"node_type":"array","attributes":{"foo":42},' 
b'"shape":[2,2,2],"data_type":"int32","chunk_grid":{"name":"regular","configuration":{"chunk_shape":[1,1,1]}},' @@ -22,11 +23,11 @@ class TestIcechunkStore(StoreTests[IcechunkStore, cpu.Buffer]): store_cls = IcechunkStore buffer_cls = cpu.Buffer - @pytest.mark.xfail(reason="not implemented") - async def test_store_eq(self) -> None: + @pytest.mark.xfail(reason="not implemented", strict=False) + def test_store_eq(self, store: IcechunkStore, store_kwargs: dict[str, Any]) -> None: pass - @pytest.mark.xfail(reason="not implemented") + @pytest.mark.xfail(reason="not implemented", strict=False) async def test_serizalizable_store(self, store) -> None: pass @@ -46,31 +47,45 @@ async def get(self, store: IcechunkStore, key: str) -> Buffer: return self.buffer_cls.from_bytes(result) @pytest.fixture(scope="function", params=[None, True]) - def store_kwargs( - self, request: pytest.FixtureRequest - ) -> dict[str, str | None | dict[str, Buffer]]: + def store_kwargs(self) -> dict[str, Any]: kwargs = { - "storage": Storage.memory(""), - "mode": "r+", + "storage": StorageConfig.memory("store_test"), + "mode": "w", } return kwargs @pytest.fixture(scope="function") - async def store( - self, store_kwargs: str | None | dict[str, Buffer] - ) -> IcechunkStore: - return await IcechunkStore.open(**store_kwargs) + async def store(self, store_kwargs: dict[str, Any]) -> IcechunkStore: + return IcechunkStore.open_or_create(**store_kwargs) @pytest.mark.xfail(reason="Not implemented") def test_store_repr(self, store: IcechunkStore) -> None: super().test_store_repr(store) - async def test_not_writable_store_raises( - self, store_kwargs: dict[str, Any] + @pytest.mark.xfail(reason="Not implemented") + def test_serializable_store(self, store: IcechunkStore) -> None: + super().test_serializable_store(store) + + def test_store_mode(self, store, store_kwargs: dict[str, Any]) -> None: + assert store.mode == AccessMode.from_literal("w") + assert not store.mode.readonly + + @pytest.mark.parametrize("mode", ["r", "r+", "a", "w", "w-"]) + def test_store_open_mode( + self, store_kwargs: dict[str, Any], mode: AccessModeLiteral ) -> None: + store_kwargs["mode"] = mode + try: + store = self.store_cls.open_or_create(**store_kwargs) + assert store._is_open + assert store.mode == AccessMode.from_literal(mode) + except Exception: + assert 'r' in mode + + async def test_not_writable_store_raises(self, store_kwargs: dict[str, Any]) -> None: create_kwargs = {**store_kwargs, "mode": "r"} with pytest.raises(ValueError): - _store = await self.store_cls.open(**create_kwargs) + _store = self.store_cls.open_or_create(**create_kwargs) # TODO # set @@ -98,9 +113,7 @@ async def test_set_many(self, store: IcechunkStore) -> None: ] # icechunk strictly checks metadata? 
data_buf = [ - self.buffer_cls.from_bytes( - k.encode() if k != "zarr.json" else ARRAY_METADATA - ) + self.buffer_cls.from_bytes(k.encode() if k != "zarr.json" else ARRAY_METADATA) for k in keys ] store_dict = dict(zip(keys, data_buf, strict=True)) @@ -123,7 +136,6 @@ def test_store_supports_partial_writes(self, store: IcechunkStore) -> None: async def test_list_prefix(self, store: IcechunkStore) -> None: assert True - @pytest.mark.xfail(reason="Not implemented") async def test_clear(self, store: IcechunkStore) -> None: await self.set( store, @@ -226,3 +238,90 @@ async def test_get(self, store: IcechunkStore) -> None: result = await store.get("zarr.json", default_buffer_prototype()) assert result is not None assert result.to_bytes() == DEFAULT_GROUP_METADATA + + async def test_get_many(self, store: IcechunkStore) -> None: + """ + Ensure that multiple keys can be retrieved at once with the _get_many method. + """ + await store.set("zarr.json", self.buffer_cls.from_bytes(ARRAY_METADATA)) + + keys = [ + "c/0/0/0", + "c/0/0/1", + "c/0/1/0", + "c/0/1/1", + "c/1/0/0", + "c/1/0/1", + "c/1/1/0", + "c/1/1/1", + ] + values = [bytes(i) for i, _ in enumerate(keys)] + for k, v in zip(keys, values, strict=False): + await self.set(store, k, self.buffer_cls.from_bytes(v)) + observed_buffers = collect_aiterator( + store._get_many( + zip( + keys, + (default_buffer_prototype(),) * len(keys), + (None,) * len(keys), + strict=False, + ) + ) + ) + observed_kvs = sorted(((k, b.to_bytes()) for k, b in observed_buffers)) # type: ignore[union-attr] + expected_kvs = sorted(((k, b) for k, b in zip(keys, values, strict=False))) + assert observed_kvs == expected_kvs + + async def test_with_mode(self, store: IcechunkStore) -> None: + data = b"0000" + await self.set(store, "zarr.json", self.buffer_cls.from_bytes(ARRAY_METADATA)) + await self.set(store, "c/0/0/0", self.buffer_cls.from_bytes(data)) + assert (await self.get(store, "c/0/0/0")).to_bytes() == data + + for mode in ["r", "a"]: + mode = cast(AccessModeLiteral, mode) + clone = store.with_mode(mode) + # await store.close() + await clone._ensure_open() + assert clone.mode == AccessMode.from_literal(mode) + assert isinstance(clone, type(store)) + + # earlier writes are visible + result = await clone.get("c/0/0/0", default_buffer_prototype()) + assert result is not None + assert result.to_bytes() == data + + # writes to original after with_mode is visible + await self.set(store, "c/0/0/1", self.buffer_cls.from_bytes(data)) + result = await clone.get("c/0/0/1", default_buffer_prototype()) + assert result is not None + assert result.to_bytes() == data + + if mode == "a": + # writes to clone is visible in the original + await clone.set("c/0/1/0", self.buffer_cls.from_bytes(data)) + result = await clone.get("c/0/1/0", default_buffer_prototype()) + assert result is not None + assert result.to_bytes() == data + + else: + with pytest.raises(ValueError, match="store error: cannot write"): + await clone.set("c/0/1/0", self.buffer_cls.from_bytes(data)) + + async def test_set_if_not_exists(self, store: IcechunkStore) -> None: + key = "zarr.json" + data_buf = self.buffer_cls.from_bytes(ARRAY_METADATA) + await self.set(store, key, data_buf) + + new = self.buffer_cls.from_bytes(b"1111") + + # no error even though the data is invalid and the metadata exists + await store.set_if_not_exists(key, new) + + result = await store.get(key, default_buffer_prototype()) + assert result == data_buf + + await store.set_if_not_exists("c/0/0/0", new) # no error + + result = await 
store.get("c/0/0/0", default_buffer_prototype()) + assert result == new diff --git a/icechunk/Cargo.toml b/icechunk/Cargo.toml index 29a42f5d..9aea3ed2 100644 --- a/icechunk/Cargo.toml +++ b/icechunk/Cargo.toml @@ -1,17 +1,24 @@ [package] name = "icechunk" -version = "0.1.0" +version = "0.1.0-alpha.3" +description = "Transactional storage engine for Zarr designed for use on cloud object storage" +readme = "../README.md" +repository = "https://github.com/earth-mover/icechunk" +homepage = "https://github.com/earth-mover/icechunk" +license = "Apache-2.0" +keywords = ["zarr", "xarray", "database"] +categories = ["database", "science", "science::geo"] +authors = ["Earthmover PBC"] edition = "2021" -description = "Icechunk client" -publish = false +publish = true [dependencies] -async-trait = "0.1.82" +async-trait = "0.1.83" bytes = { version = "1.7.2", features = ["serde"] } base64 = "0.22.1" futures = "0.3.30" itertools = "0.13.0" -object_store = { version = "0.11.0", features = ["aws"] } +object_store = { version = "0.11.0" } rand = "0.8.5" thiserror = "1.0.64" serde_json = "1.0.128" @@ -27,11 +34,16 @@ async-recursion = "1.1.1" rmp-serde = "1.3.0" url = "2.5.2" async-stream = "0.3.5" +rmpv = { version = "1.3.0", features = ["serde", "with-serde"] } +aws-sdk-s3 = "1.53.0" +aws-config = "1.5.7" +aws-credential-types = "1.2.1" +typed-path = "0.9.2" [dev-dependencies] pretty_assertions = "1.4.1" proptest-state-machine = "0.3.0" -tempfile = "3.12.0" +tempfile = "3.13.0" [lints] workspace = true diff --git a/icechunk/examples/low_level_dataset.rs b/icechunk/examples/low_level_dataset.rs index 097a07c3..9e6b9e9c 100644 --- a/icechunk/examples/low_level_dataset.rs +++ b/icechunk/examples/low_level_dataset.rs @@ -49,9 +49,9 @@ ds.add_group("/group2".into()).await?; "#, ); - ds.add_group("/".into()).await?; - ds.add_group("/group1".into()).await?; - ds.add_group("/group2".into()).await?; + ds.add_group(Path::root()).await?; + ds.add_group("/group1".try_into().unwrap()).await?; + ds.add_group("/group2".try_into().unwrap()).await?; println!(); print_nodes(&ds).await?; @@ -84,7 +84,7 @@ let zarr_meta1 = ZarrArrayMetadata {{ chunk_key_encoding: ChunkKeyEncoding::Slash, fill_value: FillValue::Int32(0), codecs: Codecs("codec".to_string()), - storage_transformers: Some(StorageTransformers("tranformers".to_string())), + storage_transformers: Some(StorageTransformers("transformers".to_string())), dimension_names: Some(vec![ Some("x".to_string()), Some("y".to_string()), @@ -129,7 +129,7 @@ ds.add_array(array1_path.clone(), zarr_meta1).await?; Some("t".to_string()), ]), }; - let array1_path: Path = "/group1/array1".into(); + let array1_path: Path = "/group1/array1".try_into().unwrap(); ds.add_array(array1_path.clone(), zarr_meta1).await?; println!(); print_nodes(&ds).await?; @@ -292,7 +292,7 @@ async fn print_nodes(ds: &Repository) -> Result<(), StoreError> { format!( "|{:10?}|{:15}|{:10?}\n", node.node_type(), - node.path.to_str().unwrap(), + node.path.to_string(), node.user_attributes, ) }) diff --git a/icechunk/src/change_set.rs b/icechunk/src/change_set.rs new file mode 100644 index 00000000..fa958f0b --- /dev/null +++ b/icechunk/src/change_set.rs @@ -0,0 +1,383 @@ +use std::{ + collections::{HashMap, HashSet}, + iter, + mem::take, +}; + +use itertools::Either; +use serde::{Deserialize, Serialize}; + +use crate::{ + format::{ + manifest::{ChunkInfo, ManifestExtents, ManifestRef}, + snapshot::{NodeData, NodeSnapshot, UserAttributesSnapshot}, + ManifestId, NodeId, + }, + metadata::UserAttributes, + 
repository::{ChunkIndices, ChunkPayload, Path, RepositoryResult, ZarrArrayMetadata}, +}; + +#[derive(Clone, Debug, PartialEq, Default, Serialize, Deserialize)] +pub struct ChangeSet { + new_groups: HashMap, + new_arrays: HashMap, + updated_arrays: HashMap, + // These paths may point to Arrays or Groups, + // since both Groups and Arrays support UserAttributes + updated_attributes: HashMap>, + // FIXME: issue with too many inline chunks kept in mem + set_chunks: HashMap>>, + deleted_groups: HashSet, + deleted_arrays: HashSet, +} + +impl ChangeSet { + pub fn is_empty(&self) -> bool { + self == &ChangeSet::default() + } + + pub fn add_group(&mut self, path: Path, node_id: NodeId) { + self.new_groups.insert(path, node_id); + } + + pub fn get_group(&self, path: &Path) -> Option<&NodeId> { + self.new_groups.get(path) + } + + pub fn get_array(&self, path: &Path) -> Option<&(NodeId, ZarrArrayMetadata)> { + self.new_arrays.get(path) + } + + pub fn delete_group(&mut self, path: Path, node_id: NodeId) { + self.updated_attributes.remove(&node_id); + match self.new_groups.remove(&path) { + Some(deleted_node_id) => { + // the group was created in this session + // so we delete it directly, no need to flag as deleted + debug_assert!(deleted_node_id == node_id); + self.delete_children(&path); + } + None => { + // it's an old group, we need to flag it as deleted + self.deleted_groups.insert(path); + } + } + } + + fn delete_children(&mut self, path: &Path) { + let groups_to_delete: Vec<_> = self + .new_groups + .iter() + .filter(|(child_path, _)| child_path.starts_with(path)) + .map(|(k, v)| (k.clone(), *v)) + .collect(); + + for (path, node) in groups_to_delete { + self.delete_group(path, node); + } + + let arrays_to_delete: Vec<_> = self + .new_arrays + .iter() + .filter(|(child_path, _)| child_path.starts_with(path)) + .map(|(k, (node, _))| (k.clone(), *node)) + .collect(); + + for (path, node) in arrays_to_delete { + self.delete_array(path, node); + } + } + + pub fn add_array( + &mut self, + path: Path, + node_id: NodeId, + metadata: ZarrArrayMetadata, + ) { + self.new_arrays.insert(path, (node_id, metadata)); + } + + pub fn update_array(&mut self, node_id: NodeId, metadata: ZarrArrayMetadata) { + self.updated_arrays.insert(node_id, metadata); + } + + pub fn delete_array(&mut self, path: Path, node_id: NodeId) { + // if deleting a new array created in this session, just remove the entry + // from new_arrays + let node_and_meta = self.new_arrays.remove(&path); + let is_new_array = node_and_meta.is_some(); + debug_assert!(!is_new_array || node_and_meta.map(|n| n.0) == Some(node_id)); + + self.updated_arrays.remove(&node_id); + self.updated_attributes.remove(&node_id); + self.set_chunks.remove(&node_id); + if !is_new_array { + self.deleted_arrays.insert(path); + } + } + + pub fn is_deleted(&self, path: &Path) -> bool { + self.deleted_groups.contains(path) + || self.deleted_arrays.contains(path) + || path.ancestors().skip(1).any(|parent| self.is_deleted(&parent)) + } + + pub fn has_updated_attributes(&self, node_id: &NodeId) -> bool { + self.updated_attributes.contains_key(node_id) + } + + pub fn get_updated_zarr_metadata( + &self, + node_id: NodeId, + ) -> Option<&ZarrArrayMetadata> { + self.updated_arrays.get(&node_id) + } + + pub fn update_user_attributes( + &mut self, + node_id: NodeId, + atts: Option, + ) { + self.updated_attributes.insert(node_id, atts); + } + + pub fn get_user_attributes( + &self, + node_id: NodeId, + ) -> Option<&Option> { + self.updated_attributes.get(&node_id) + } + + pub fn 
set_chunk_ref( + &mut self, + node_id: NodeId, + coord: ChunkIndices, + data: Option, + ) { + // this implementation makes delete idempotent + // it allows deleting a deleted chunk by repeatedly setting None. + self.set_chunks + .entry(node_id) + .and_modify(|h| { + h.insert(coord.clone(), data.clone()); + }) + .or_insert(HashMap::from([(coord, data)])); + } + + pub fn get_chunk_ref( + &self, + node_id: NodeId, + coords: &ChunkIndices, + ) -> Option<&Option> { + self.set_chunks.get(&node_id).and_then(|h| h.get(coords)) + } + + pub fn array_chunks_iterator( + &self, + node_id: NodeId, + node_path: &Path, + ) -> impl Iterator)> { + if self.is_deleted(node_path) { + return Either::Left(iter::empty()); + } + match self.set_chunks.get(&node_id) { + None => Either::Left(iter::empty()), + Some(h) => Either::Right(h.iter()), + } + } + + pub fn new_arrays_chunk_iterator( + &self, + ) -> impl Iterator + '_ { + self.new_arrays.iter().flat_map(|(path, (node_id, _))| { + self.array_chunks_iterator(*node_id, path).filter_map(|(coords, payload)| { + payload.as_ref().map(|p| { + ( + path.clone(), + ChunkInfo { + node: *node_id, + coord: coords.clone(), + payload: p.clone(), + }, + ) + }) + }) + }) + } + + pub fn new_nodes(&self) -> impl Iterator { + self.new_groups.keys().chain(self.new_arrays.keys()) + } + + pub fn take_chunks( + &mut self, + ) -> HashMap>> { + take(&mut self.set_chunks) + } + + pub fn set_chunks( + &mut self, + chunks: HashMap>>, + ) { + self.set_chunks = chunks + } + + /// Merge this ChangeSet with `other`. + /// + /// Results of the merge are applied to `self`. Changes present in `other` take precedence over + /// `self` changes. + pub fn merge(&mut self, other: ChangeSet) { + // FIXME: this should detect conflict, for example, if different writers added on the same + // path, different objects, or if the same path is added and deleted, etc. + // TODO: optimize + self.new_groups.extend(other.new_groups); + self.new_arrays.extend(other.new_arrays); + self.updated_arrays.extend(other.updated_arrays); + self.updated_attributes.extend(other.updated_attributes); + self.deleted_groups.extend(other.deleted_groups); + self.deleted_arrays.extend(other.deleted_arrays); + + for (node, other_chunks) in other.set_chunks.into_iter() { + match self.set_chunks.remove(&node) { + Some(mut old_value) => { + old_value.extend(other_chunks); + self.set_chunks.insert(node, old_value); + } + None => { + self.set_chunks.insert(node, other_chunks); + } + } + } + } + + pub fn merge_many>(&mut self, others: T) { + others.into_iter().fold(self, |res, change_set| { + res.merge(change_set); + res + }); + } + + /// Serialize this ChangeSet + /// + /// This is intended to help with marshalling distributed writers back to the coordinator + pub fn export_to_bytes(&self) -> RepositoryResult> { + Ok(rmp_serde::to_vec(self)?) + } + + /// Deserialize a ChangeSet + /// + /// This is intended to help with marshalling distributed writers back to the coordinator + pub fn import_from_bytes(bytes: &[u8]) -> RepositoryResult { + Ok(rmp_serde::from_slice(bytes)?) 
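+        // Illustrative round-trip, per the doc comments above (a sketch, not part of this
+        // function): a distributed writer calls `change_set.export_to_bytes()` and ships the
+        // bytes to the coordinator, which calls `ChangeSet::import_from_bytes(&bytes)` and
+        // folds the results together with `merge` / `merge_many` before committing once.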
+ } + + pub fn update_existing_chunks<'a>( + &'a self, + node: NodeId, + chunks: impl Iterator + 'a, + ) -> impl Iterator + 'a { + chunks.filter_map(move |chunk| match self.get_chunk_ref(node, &chunk.coord) { + None => Some(chunk), + Some(new_payload) => { + new_payload.clone().map(|pl| ChunkInfo { payload: pl, ..chunk }) + } + }) + } + + pub fn get_new_node(&self, path: &Path) -> Option { + self.get_new_array(path).or(self.get_new_group(path)) + } + + pub fn get_new_array(&self, path: &Path) -> Option { + self.get_array(path).map(|(id, meta)| { + let meta = self.get_updated_zarr_metadata(*id).unwrap_or(meta).clone(); + let atts = self.get_user_attributes(*id).cloned(); + NodeSnapshot { + id: *id, + path: path.clone(), + user_attributes: atts.flatten().map(UserAttributesSnapshot::Inline), + // We put no manifests in new arrays, see get_chunk_ref to understand how chunks get + // fetched for those arrays + node_data: NodeData::Array(meta.clone(), vec![]), + } + }) + } + + pub fn get_new_group(&self, path: &Path) -> Option { + self.get_group(path).map(|id| { + let atts = self.get_user_attributes(*id).cloned(); + NodeSnapshot { + id: *id, + path: path.clone(), + user_attributes: atts.flatten().map(UserAttributesSnapshot::Inline), + node_data: NodeData::Group, + } + }) + } + + pub fn new_nodes_iterator<'a>( + &'a self, + manifest_id: Option<&'a ManifestId>, + ) -> impl Iterator + 'a { + self.new_nodes().filter_map(move |path| { + if self.is_deleted(path) { + return None; + } + // we should be able to create the full node because we + // know it's a new node + #[allow(clippy::expect_used)] + let node = self.get_new_node(path).expect("Bug in new_nodes implementation"); + match node.node_data { + NodeData::Group => Some(node), + NodeData::Array(meta, _no_manifests_yet) => { + let new_manifests = manifest_id + .map(|mid| { + vec![ManifestRef { + object_id: mid.clone(), + extents: ManifestExtents(vec![]), + }] + }) + .unwrap_or_default(); + Some(NodeSnapshot { + node_data: NodeData::Array(meta, new_manifests), + ..node + }) + } + } + }) + } + + pub fn update_existing_node( + &self, + node: NodeSnapshot, + new_manifests: Option>, + ) -> Option { + if self.is_deleted(&node.path) { + return None; + } + + let session_atts = self + .get_user_attributes(node.id) + .cloned() + .map(|a| a.map(UserAttributesSnapshot::Inline)); + let new_atts = session_atts.unwrap_or(node.user_attributes); + match node.node_data { + NodeData::Group => Some(NodeSnapshot { user_attributes: new_atts, ..node }), + NodeData::Array(old_zarr_meta, _) => { + let new_zarr_meta = self + .get_updated_zarr_metadata(node.id) + .cloned() + .unwrap_or(old_zarr_meta); + + Some(NodeSnapshot { + node_data: NodeData::Array( + new_zarr_meta, + new_manifests.unwrap_or_default(), + ), + user_attributes: new_atts, + ..node + }) + } + } + } +} diff --git a/icechunk/src/format/manifest.rs b/icechunk/src/format/manifest.rs index ef7862a1..7d76be79 100644 --- a/icechunk/src/format/manifest.rs +++ b/icechunk/src/format/manifest.rs @@ -1,3 +1,4 @@ +use futures::{pin_mut, Stream, TryStreamExt}; use itertools::Itertools; use std::{collections::BTreeMap, ops::Bound, sync::Arc}; use thiserror::Error; @@ -6,8 +7,8 @@ use bytes::Bytes; use serde::{Deserialize, Serialize}; use super::{ - ChunkId, ChunkIndices, ChunkLength, ChunkOffset, Flags, IcechunkFormatError, - IcechunkResult, ManifestId, NodeId, + format_constants, ChunkId, ChunkIndices, ChunkLength, ChunkOffset, + IcechunkFormatError, IcechunkFormatVersion, IcechunkResult, ManifestId, NodeId, }; 
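For readers following the new `change_set` module above, here is a minimal, hypothetical sketch (not part of this diff) of the distributed-writer round trip that the `export_to_bytes` / `import_from_bytes` / `merge_many` doc comments describe. The crate paths follow the re-exports added in this PR; the group path and node id are made-up values, and `NodeId` is assumed to be the plain integer id type used in the snapshot tests.

```rust
use icechunk::{change_set::ChangeSet, format::Path};

// A worker records its local edits and ships them back as msgpack bytes.
fn worker_changes() -> Result<Vec<u8>, Box<dyn std::error::Error>> {
    let mut cs = ChangeSet::default();
    let path: Path = "/workers/w0".try_into()?; // hypothetical group path
    cs.add_group(path, 42); // 42: made-up node id, assumed to be an integer NodeId
    Ok(cs.export_to_bytes()?)
}

// The coordinator deserializes every worker's bytes and folds them into one set.
fn coordinator_merge(
    payloads: &[Vec<u8>],
) -> Result<ChangeSet, Box<dyn std::error::Error>> {
    let mut merged = ChangeSet::default();
    let others = payloads
        .iter()
        .map(|b| ChangeSet::import_from_bytes(b))
        .collect::<Result<Vec<_>, _>>()?;
    // Per the `merge` docs, entries from later change sets win on conflicting keys.
    merged.merge_many(others);
    Ok(merged)
}
```

The merged set could then seed a session via `RepositoryBuilder::with_change_set`, or the per-worker sets could be handed directly to `distributed_commit`; both entry points appear later in this diff.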
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -16,11 +17,11 @@ pub struct ManifestExtents(pub Vec); #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct ManifestRef { pub object_id: ManifestId, - pub flags: Flags, pub extents: ManifestExtents, } #[derive(Debug, Error)] +#[non_exhaustive] pub enum VirtualReferenceError { #[error("error parsing virtual ref URL {0}")] CannotParseUrl(#[from] url::ParseError), @@ -37,6 +38,7 @@ pub enum VirtualReferenceError { } #[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] +#[non_exhaustive] pub enum VirtualChunkLocation { Absolute(String), // Relative(prefix_id, String) @@ -49,6 +51,7 @@ impl VirtualChunkLocation { // make sure we can parse the provided URL before creating the enum // TODO: consider other validation here. let url = url::Url::parse(path)?; + let scheme = url.scheme(); let new_path: String = url .path_segments() .ok_or(VirtualReferenceError::NoPathSegments(path.into()))? @@ -56,15 +59,17 @@ impl VirtualChunkLocation { .filter(|x| !x.is_empty()) .join("/"); - let host = url - .host() - .ok_or_else(|| VirtualReferenceError::CannotParseBucketName(path.into()))?; - Ok(VirtualChunkLocation::Absolute(format!( - "{}://{}/{}", - url.scheme(), - host, - new_path, - ))) + let host = if let Some(host) = url.host() { + host.to_string() + } else if scheme == "file" { + "".to_string() + } else { + return Err(VirtualReferenceError::CannotParseBucketName(path.into())); + }; + + let location = format!("{}://{}/{}", scheme, host, new_path,); + + Ok(VirtualChunkLocation::Absolute(location)) } } @@ -83,8 +88,9 @@ pub struct ChunkRef { } #[derive(Clone, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)] +#[non_exhaustive] pub enum ChunkPayload { - Inline(Bytes), // FIXME: optimize copies + Inline(Bytes), Virtual(VirtualChunkRef), Ref(ChunkRef), } @@ -98,7 +104,9 @@ pub struct ChunkInfo { #[derive(Debug, PartialEq, Serialize, Deserialize, Default)] pub struct Manifest { - pub chunks: BTreeMap<(NodeId, ChunkIndices), ChunkPayload>, + pub icechunk_manifest_format_version: IcechunkFormatVersion, + pub icechunk_manifest_format_flags: BTreeMap, + chunks: BTreeMap<(NodeId, ChunkIndices), ChunkPayload>, } impl Manifest { @@ -119,6 +127,39 @@ impl Manifest { ) -> impl Iterator { PayloadIterator { manifest: self, for_node: *node, last_key: None } } + + pub fn new(chunks: BTreeMap<(NodeId, ChunkIndices), ChunkPayload>) -> Self { + Self { + chunks, + icechunk_manifest_format_version: + format_constants::LATEST_ICECHUNK_MANIFEST_FORMAT, + icechunk_manifest_format_flags: Default::default(), + } + } + + pub async fn from_stream( + chunks: impl Stream>, + ) -> Result { + let mut chunk_map = BTreeMap::new(); + pin_mut!(chunks); + while let Some(chunk) = chunks.try_next().await? 
{ + chunk_map.insert((chunk.node, chunk.coord), chunk.payload); + } + Ok(Self::new(chunk_map)) + } + + pub fn chunks(&self) -> &BTreeMap<(NodeId, ChunkIndices), ChunkPayload> { + &self.chunks + } + + pub fn len(&self) -> usize { + self.chunks.len() + } + + #[must_use] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } } impl FromIterator for Manifest { @@ -127,7 +168,7 @@ impl FromIterator for Manifest { .into_iter() .map(|chunk| ((chunk.node, chunk.coord), chunk.payload)) .collect(); - Manifest { chunks } + Self::new(chunks) } } diff --git a/icechunk/src/format/mod.rs b/icechunk/src/format/mod.rs index 6e53d840..361fe164 100644 --- a/icechunk/src/format/mod.rs +++ b/icechunk/src/format/mod.rs @@ -3,26 +3,29 @@ use std::{ fmt::{Debug, Display}, hash::Hash, marker::PhantomData, - ops::Bound, - path::PathBuf, + ops::Range, }; use bytes::Bytes; use itertools::Itertools; use rand::{thread_rng, Rng}; use serde::{Deserialize, Deserializer, Serialize}; +use serde_with::{serde_as, TryFromInto}; use thiserror::Error; +use typed_path::Utf8UnixPathBuf; -use crate::metadata::DataType; +use crate::{metadata::DataType, private}; pub mod attributes; pub mod manifest; pub mod snapshot; -pub type Path = PathBuf; +#[serde_as] +#[derive(Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Clone, Serialize, Deserialize)] +pub struct Path(#[serde_as(as = "TryFromInto")] Utf8UnixPathBuf); #[allow(dead_code)] -pub trait FileTypeTag {} +pub trait FileTypeTag: private::Sealed {} /// The id of a file in object store #[derive(Hash, Clone, PartialEq, Eq, PartialOrd, Ord)] @@ -40,6 +43,10 @@ pub struct ChunkTag; #[derive(Hash, Clone, PartialEq, Eq, PartialOrd, Ord)] pub struct AttributesTag; +impl private::Sealed for SnapshotTag {} +impl private::Sealed for ManifestTag {} +impl private::Sealed for ChunkTag {} +impl private::Sealed for AttributesTag {} impl FileTypeTag for SnapshotTag {} impl FileTypeTag for ManifestTag {} impl FileTypeTag for ChunkTag {} @@ -131,60 +138,47 @@ pub type ChunkOffset = u64; pub type ChunkLength = u64; #[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct ByteRange(pub Bound, pub Bound); +pub enum ByteRange { + /// The fixed length range represented by the given `Range` + Bounded(Range), + /// All bytes from the given offset (included) to the end of the object + From(ChunkOffset), + /// Last n bytes in the object + Last(ChunkLength), +} + +impl From> for ByteRange { + fn from(value: Range) -> Self { + ByteRange::Bounded(value) + } +} impl ByteRange { pub fn from_offset(offset: ChunkOffset) -> Self { - Self(Bound::Included(offset), Bound::Unbounded) + Self::From(offset) } pub fn from_offset_with_length(offset: ChunkOffset, length: ChunkOffset) -> Self { - Self(Bound::Included(offset), Bound::Excluded(offset + length)) + Self::Bounded(offset..offset + length) } pub fn to_offset(offset: ChunkOffset) -> Self { - Self(Bound::Unbounded, Bound::Excluded(offset)) + Self::Bounded(0..offset) } pub fn bounded(start: ChunkOffset, end: ChunkOffset) -> Self { - Self(Bound::Included(start), Bound::Excluded(end)) - } - - pub fn length(&self) -> Option { - match (self.0, self.1) { - (_, Bound::Unbounded) => None, - (Bound::Unbounded, Bound::Excluded(end)) => Some(end), - (Bound::Unbounded, Bound::Included(end)) => Some(end + 1), - (Bound::Included(start), Bound::Excluded(end)) => Some(end - start), - (Bound::Excluded(start), Bound::Included(end)) => Some(end - start), - (Bound::Included(start), Bound::Included(end)) => Some(end - start + 1), - (Bound::Excluded(start), Bound::Excluded(end)) => Some(end - 
start - 1), - } + (start..end).into() } - pub const ALL: Self = Self(Bound::Unbounded, Bound::Unbounded); + pub const ALL: Self = Self::From(0); pub fn slice(&self, bytes: Bytes) -> Bytes { - match (self.0, self.1) { - (Bound::Included(start), Bound::Excluded(end)) => { - bytes.slice(start as usize..end as usize) - } - (Bound::Included(start), Bound::Unbounded) => bytes.slice(start as usize..), - (Bound::Unbounded, Bound::Excluded(end)) => bytes.slice(..end as usize), - (Bound::Excluded(start), Bound::Excluded(end)) => { - bytes.slice(start as usize + 1..end as usize) + match self { + ByteRange::Bounded(range) => { + bytes.slice(range.start as usize..range.end as usize) } - (Bound::Excluded(start), Bound::Unbounded) => { - bytes.slice(start as usize + 1..) - } - (Bound::Unbounded, Bound::Included(end)) => bytes.slice(..=end as usize), - (Bound::Included(start), Bound::Included(end)) => { - bytes.slice(start as usize..=end as usize) - } - (Bound::Excluded(start), Bound::Included(end)) => { - bytes.slice(start as usize + 1..=end as usize) - } - (Bound::Unbounded, Bound::Unbounded) => bytes, + ByteRange::From(from) => bytes.slice(*from as usize..), + ByteRange::Last(n) => bytes.slice(bytes.len() - *n as usize..bytes.len()), } } } @@ -192,22 +186,18 @@ impl ByteRange { impl From<(Option, Option)> for ByteRange { fn from((start, end): (Option, Option)) -> Self { match (start, end) { - (Some(start), Some(end)) => { - Self(Bound::Included(start), Bound::Excluded(end)) - } - (Some(start), None) => Self(Bound::Included(start), Bound::Unbounded), - (None, Some(end)) => Self(Bound::Unbounded, Bound::Excluded(end)), - (None, None) => Self(Bound::Unbounded, Bound::Unbounded), + (Some(start), Some(end)) => Self::Bounded(start..end), + (Some(start), None) => Self::From(start), + (None, Some(end)) => Self::Bounded(0..end), + (None, None) => Self::ALL, } } } pub type TableOffset = u32; -#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] -pub struct Flags(); // FIXME: implement - #[derive(Debug, Clone, Error, PartialEq, Eq)] +#[non_exhaustive] pub enum IcechunkFormatError { #[error("error decoding fill_value from array")] FillValueDecodeError { found_size: usize, target_size: usize, target_type: DataType }, @@ -221,6 +211,85 @@ pub enum IcechunkFormatError { pub type IcechunkResult = Result; +type IcechunkFormatVersion = u16; + +pub mod format_constants { + use super::IcechunkFormatVersion; + + pub const LATEST_ICECHUNK_MANIFEST_FORMAT: IcechunkFormatVersion = 0; + pub const LATEST_ICECHUNK_MANIFEST_CONTENT_TYPE: &str = "application/msgpack"; + pub const LATEST_ICECHUNK_MANIFEST_VERSION_METADATA_KEY: &str = "ic-man-fmt-ver"; + + pub const LATEST_ICECHUNK_SNAPSHOT_FORMAT: IcechunkFormatVersion = 0; + pub const LATEST_ICECHUNK_SNAPSHOT_CONTENT_TYPE: &str = "application/msgpack"; + pub const LATEST_ICECHUNK_SNAPSHOT_VERSION_METADATA_KEY: &str = "ic-sna-fmt-ver"; +} + +impl Display for Path { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.0) + } +} + +#[derive(Debug, Clone, Error, PartialEq, Eq)] +#[non_exhaustive] +pub enum PathError { + #[error("path must start with a `/` character")] + NotAbsolute, + #[error(r#"path must be cannonic, cannot include "." 
or "..""#)] + NotCanonic, +} + +impl Path { + pub fn root() -> Path { + Path(Utf8UnixPathBuf::from("/".to_string())) + } + + pub fn new(path: &str) -> Result { + let buf = Utf8UnixPathBuf::from(path); + if !buf.is_absolute() { + return Err(PathError::NotAbsolute); + } + + if buf.normalize() != buf { + return Err(PathError::NotCanonic); + } + Ok(Path(buf)) + } + + pub fn starts_with(&self, other: &Path) -> bool { + self.0.starts_with(&other.0) + } + + pub fn ancestors(&self) -> impl Iterator + '_ { + self.0.ancestors().map(|p| Path(p.to_owned())) + } +} + +impl TryFrom<&str> for Path { + type Error = PathError; + + fn try_from(value: &str) -> Result { + Self::new(value) + } +} + +impl TryFrom<&String> for Path { + type Error = PathError; + + fn try_from(value: &String) -> Result { + value.as_str().try_into() + } +} + +impl TryFrom for Path { + type Error = PathError; + + fn try_from(value: String) -> Result { + value.as_str().try_into() + } +} + #[cfg(test)] #[allow(clippy::panic, clippy::unwrap_used, clippy::expect_used)] mod tests { diff --git a/icechunk/src/format/snapshot.rs b/icechunk/src/format/snapshot.rs index 4f386be2..8fd4f009 100644 --- a/icechunk/src/format/snapshot.rs +++ b/icechunk/src/format/snapshot.rs @@ -14,15 +14,15 @@ use crate::metadata::{ }; use super::{ - manifest::ManifestRef, AttributesId, Flags, IcechunkFormatError, IcechunkResult, - NodeId, ObjectId, Path, SnapshotId, TableOffset, + format_constants, manifest::ManifestRef, AttributesId, IcechunkFormatError, + IcechunkFormatVersion, IcechunkResult, ManifestId, NodeId, ObjectId, Path, + SnapshotId, TableOffset, }; #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] pub struct UserAttributesRef { pub object_id: AttributesId, pub location: TableOffset, - pub flags: Flags, } #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -81,8 +81,26 @@ pub struct SnapshotMetadata { pub type SnapshotProperties = HashMap; +#[derive(Debug, PartialEq, Serialize, Deserialize, Clone)] +pub struct ManifestFileInfo { + pub id: ManifestId, + pub format_version: IcechunkFormatVersion, +} + +#[derive(Debug, PartialEq, Serialize, Deserialize, Clone)] +pub struct AttributeFileInfo { + pub id: AttributesId, + pub format_version: IcechunkFormatVersion, +} + #[derive(Debug, PartialEq, Serialize, Deserialize)] pub struct Snapshot { + pub icechunk_snapshot_format_version: IcechunkFormatVersion, + pub icechunk_snapshot_format_flags: BTreeMap, + + pub manifest_files: Vec, + pub attribute_files: Vec, + pub total_parents: u32, // we denormalize this field to have it easily available in the serialized file pub short_term_parents: u16, @@ -91,7 +109,7 @@ pub struct Snapshot { pub metadata: SnapshotMetadata, pub started_at: DateTime, pub properties: SnapshotProperties, - pub nodes: BTreeMap, + nodes: BTreeMap, } impl Default for SnapshotMetadata { @@ -113,17 +131,24 @@ impl SnapshotMetadata { impl Snapshot { pub const INITIAL_COMMIT_MESSAGE: &'static str = "Repository initialized"; - pub fn new( + fn new( short_term_history: VecDeque, total_parents: u32, properties: Option, + nodes: BTreeMap, + manifest_files: Vec, + attribute_files: Vec, ) -> Self { let metadata = SnapshotMetadata::default(); let short_term_parents = short_term_history.len() as u16; let started_at = Utc::now(); let properties = properties.unwrap_or_default(); - let nodes = BTreeMap::new(); Self { + icechunk_snapshot_format_version: + format_constants::LATEST_ICECHUNK_SNAPSHOT_FORMAT, + icechunk_snapshot_format_flags: Default::default(), + manifest_files, + 
attribute_files, total_parents, short_term_parents, short_term_history, @@ -135,50 +160,33 @@ impl Snapshot { } pub fn from_iter>( - short_term_history: VecDeque, - total_parents: u32, + parent: &Snapshot, properties: Option, + manifest_files: Vec, + attribute_files: Vec, iter: T, ) -> Self { let nodes = iter.into_iter().map(|node| (node.path.clone(), node)).collect(); - Self { nodes, ..Self::new(short_term_history, total_parents, properties) } - } - - pub fn first(properties: Option) -> Self { - Self::new(VecDeque::new(), 0, properties) - } - - pub fn first_from_iter>( - properties: Option, - iter: T, - ) -> Self { - Self::from_iter(VecDeque::new(), 0, properties, iter) - } - - pub fn from_parent( - parent: &Snapshot, - properties: Option, - ) -> Self { let mut history = parent.short_term_history.clone(); history.push_front(parent.metadata.clone()); - Self::new(history, parent.total_parents + 1, properties) - } - pub fn child_from_iter>( - parent: &Snapshot, - properties: Option, - iter: T, - ) -> Self { - let mut res = Self::from_parent(parent, properties); - let with_nodes = Self::first_from_iter(None, iter); - res.nodes = with_nodes.nodes; - res + Self::new( + history, + parent.total_parents + 1, + properties, + nodes, + manifest_files, + attribute_files, + ) } pub fn empty() -> Self { let metadata = SnapshotMetadata::with_message(Self::INITIAL_COMMIT_MESSAGE.to_string()); - Self { metadata, ..Self::first(None) } + Self { + metadata, + ..Self::new(VecDeque::new(), 0, None, Default::default(), vec![], vec![]) + } } pub fn get_node(&self, path: &Path) -> IcechunkResult<&NodeSnapshot> { @@ -199,6 +207,15 @@ impl Snapshot { (0..self.short_term_history.len()) .map(move |ix| self.short_term_history[ix].clone()) } + + pub fn len(&self) -> usize { + self.nodes.len() + } + + #[must_use] + pub fn is_empty(&self) -> bool { + self.len() == 0 + } } // We need this complex dance because Rust makes it really hard to put together an object and a @@ -295,37 +312,35 @@ mod tests { ZarrArrayMetadata { dimension_names: None, ..zarr_meta2.clone() }; let man_ref1 = ManifestRef { object_id: ObjectId::random(), - flags: Flags(), extents: ManifestExtents(vec![]), }; let man_ref2 = ManifestRef { object_id: ObjectId::random(), - flags: Flags(), extents: ManifestExtents(vec![]), }; let oid = ObjectId::random(); let nodes = vec![ NodeSnapshot { - path: "/".into(), + path: Path::root(), id: 1, user_attributes: None, node_data: NodeData::Group, }, NodeSnapshot { - path: "/a".into(), + path: "/a".try_into().unwrap(), id: 2, user_attributes: None, node_data: NodeData::Group, }, NodeSnapshot { - path: "/b".into(), + path: "/b".try_into().unwrap(), id: 3, user_attributes: None, node_data: NodeData::Group, }, NodeSnapshot { - path: "/b/c".into(), + path: "/b/c".try_into().unwrap(), id: 4, user_attributes: Some(UserAttributesSnapshot::Inline( UserAttributes::try_new(br#"{"foo": "some inline"}"#).unwrap(), @@ -333,12 +348,11 @@ mod tests { node_data: NodeData::Group, }, NodeSnapshot { - path: "/b/array1".into(), + path: "/b/array1".try_into().unwrap(), id: 5, user_attributes: Some(UserAttributesSnapshot::Ref(UserAttributesRef { object_id: oid.clone(), location: 42, - flags: Flags(), })), node_data: NodeData::Array( zarr_meta1.clone(), @@ -346,29 +360,43 @@ mod tests { ), }, NodeSnapshot { - path: "/array2".into(), + path: "/array2".try_into().unwrap(), id: 6, user_attributes: None, node_data: NodeData::Array(zarr_meta2.clone(), vec![]), }, NodeSnapshot { - path: "/b/array3".into(), + path: "/b/array3".try_into().unwrap(), 
id: 7, user_attributes: None, node_data: NodeData::Array(zarr_meta3.clone(), vec![]), }, ]; - let st = Snapshot::first_from_iter(None, nodes); + let initial = Snapshot::empty(); + let manifests = vec![ + ManifestFileInfo { + id: man_ref1.object_id.clone(), + format_version: format_constants::LATEST_ICECHUNK_MANIFEST_FORMAT, + }, + ManifestFileInfo { + id: man_ref2.object_id.clone(), + format_version: format_constants::LATEST_ICECHUNK_MANIFEST_FORMAT, + }, + ]; + let st = Snapshot::from_iter(&initial, None, manifests, vec![], nodes); + assert_eq!( - st.get_node(&"/nonexistent".into()), - Err(IcechunkFormatError::NodeNotFound { path: "/nonexistent".into() }) + st.get_node(&"/nonexistent".try_into().unwrap()), + Err(IcechunkFormatError::NodeNotFound { + path: "/nonexistent".try_into().unwrap() + }) ); - let node = st.get_node(&"/b/c".into()); + let node = st.get_node(&"/b/c".try_into().unwrap()); assert_eq!( node, Ok(&NodeSnapshot { - path: "/b/c".into(), + path: "/b/c".try_into().unwrap(), id: 4, user_attributes: Some(UserAttributesSnapshot::Inline( UserAttributes::try_new(br#"{"foo": "some inline"}"#).unwrap(), @@ -376,45 +404,44 @@ mod tests { node_data: NodeData::Group, }), ); - let node = st.get_node(&"/".into()); + let node = st.get_node(&Path::root()); assert_eq!( node, Ok(&NodeSnapshot { - path: "/".into(), + path: Path::root(), id: 1, user_attributes: None, node_data: NodeData::Group, }), ); - let node = st.get_node(&"/b/array1".into()); + let node = st.get_node(&"/b/array1".try_into().unwrap()); assert_eq!( node, Ok(&NodeSnapshot { - path: "/b/array1".into(), + path: "/b/array1".try_into().unwrap(), id: 5, user_attributes: Some(UserAttributesSnapshot::Ref(UserAttributesRef { object_id: oid, location: 42, - flags: Flags(), })), node_data: NodeData::Array(zarr_meta1.clone(), vec![man_ref1, man_ref2]), }), ); - let node = st.get_node(&"/array2".into()); + let node = st.get_node(&"/array2".try_into().unwrap()); assert_eq!( node, Ok(&NodeSnapshot { - path: "/array2".into(), + path: "/array2".try_into().unwrap(), id: 6, user_attributes: None, node_data: NodeData::Array(zarr_meta2.clone(), vec![]), }), ); - let node = st.get_node(&"/b/array3".into()); + let node = st.get_node(&"/b/array3".try_into().unwrap()); assert_eq!( node, Ok(&NodeSnapshot { - path: "/b/array3".into(), + path: "/b/array3".try_into().unwrap(), id: 7, user_attributes: None, node_data: NodeData::Array(zarr_meta3.clone(), vec![]), diff --git a/icechunk/src/lib.rs b/icechunk/src/lib.rs index 2ec87fb7..e46b48cb 100644 --- a/icechunk/src/lib.rs +++ b/icechunk/src/lib.rs @@ -7,7 +7,7 @@ //! - There is a low level interface that speaks zarr keys and values, and is used to provide the //! zarr store that will be used from python. This is the [`zarr::Store`] type. //! - There is a translation language between low and high levels. When user writes to a zarr key, -//! we need to convert that key to the language of arrays and groups. This is implmented it the +//! we need to convert that key to the language of arrays and groups. This is implemented it the //! [`zarr`] module //! - There is an abstract type for loading and saving of the Arrow datastructures. //! This is the [`Storage`] trait. It knows how to fetch and write arrow. @@ -17,6 +17,7 @@ //! - a caching wrapper implementation //! - The datastructures are represented by concrete types in the [`mod@format`] modules. //! These datastructures use Arrow RecordBatches for representation. 
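Since the `ByteRange` rework in `format/mod.rs` above replaces the old `(Bound, Bound)` pair with three explicit variants, a small illustrative sketch (not part of this diff, with made-up byte values) shows how the new variants slice a `Bytes` buffer; it assumes the `bytes` crate and the public `icechunk::format` items shown in this PR.

```rust
use bytes::Bytes;
use icechunk::format::ByteRange;

fn byte_range_demo() {
    let data = Bytes::from_static(b"hello icechunk");

    // Bounded(0..5): a fixed-length window over the first five bytes.
    assert_eq!(ByteRange::from_offset_with_length(0, 5).slice(data.clone()), "hello");
    // From(6): everything from offset 6 (inclusive) to the end of the object.
    assert_eq!(ByteRange::from_offset(6).slice(data.clone()), "icechunk");
    // Last(8): only the trailing eight bytes.
    assert_eq!(ByteRange::Last(8).slice(data.clone()), "icechunk");
    // ALL is now simply From(0), i.e. the whole object.
    assert_eq!(ByteRange::ALL.slice(data), "hello icechunk");
}
```

Unlike the old `Bound`-based pair, `Last(n)` expresses a suffix request directly, something the previous representation could not encode.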
+pub mod change_set; pub mod format; pub mod metadata; pub mod refs; @@ -29,3 +30,9 @@ pub mod zarr; pub use repository::{Repository, RepositoryBuilder, RepositoryConfig, SnapshotMetadata}; pub use storage::{MemCachingStorage, ObjectStorage, Storage, StorageError}; pub use zarr::Store; + +mod private { + /// Used to seal traits we don't want user code to implement, to maintain compatibility. + /// See https://rust-lang.github.io/api-guidelines/future-proofing.html#sealed-traits-protect-against-downstream-implementations-c-sealed + pub trait Sealed {} +} diff --git a/icechunk/src/metadata/data_type.rs b/icechunk/src/metadata/data_type.rs index c052d1be..4349c5f2 100644 --- a/icechunk/src/metadata/data_type.rs +++ b/icechunk/src/metadata/data_type.rs @@ -20,8 +20,8 @@ pub enum DataType { Float64, Complex64, Complex128, - // FIXME: serde serialization - RawBits(usize), + String, + Bytes, } impl DataType { @@ -67,17 +67,9 @@ impl TryFrom<&str> for DataType { "float64" => Ok(DataType::Float64), "complex64" => Ok(DataType::Complex64), "complex128" => Ok(DataType::Complex128), - _ => { - let mut it = value.chars(); - if it.next() == Some('r') { - it.as_str() - .parse() - .map(DataType::RawBits) - .map_err(|_| "Cannot parse RawBits size") - } else { - Err("Unknown data type, cannot parse") - } - } + "string" => Ok(DataType::String), + "bytes" => Ok(DataType::Bytes), + _ => Err("Unknown data type, cannot parse"), } } } @@ -100,7 +92,8 @@ impl Display for DataType { Float64 => f.write_str("float64"), Complex64 => f.write_str("complex64"), Complex128 => f.write_str("complex128"), - RawBits(usize) => write!(f, "r{}", usize), + String => f.write_str("string"), + Bytes => f.write_str("bytes"), } } } diff --git a/icechunk/src/metadata/fill_value.rs b/icechunk/src/metadata/fill_value.rs index 7f03f49e..31910248 100644 --- a/icechunk/src/metadata/fill_value.rs +++ b/icechunk/src/metadata/fill_value.rs @@ -1,3 +1,4 @@ +use itertools::Itertools; use serde::{Deserialize, Serialize}; use test_strategy::Arbitrary; @@ -22,7 +23,8 @@ pub enum FillValue { Float64(f64), Complex64(f32, f32), Complex128(f64, f64), - RawBits(Vec), + String(String), + Bytes(Vec), } impl FillValue { @@ -181,20 +183,29 @@ impl FillValue { } } - (DataType::RawBits(n), serde_json::Value::Array(arr)) if arr.len() == *n => { - let bits = arr + (DataType::String, serde_json::Value::String(s)) => { + Ok(FillValue::String(s.clone())) + } + + (DataType::Bytes, serde_json::Value::Array(arr)) => { + let bytes = arr .iter() .map(|b| FillValue::from_data_type_and_json(&DataType::UInt8, b)) .collect::, _>>()?; - Ok(FillValue::RawBits( - bits.iter() + Ok(FillValue::Bytes( + bytes + .iter() .map(|b| match b { - FillValue::UInt8(n) => *n, - _ => 0, + FillValue::UInt8(n) => Ok(*n), + _ => Err(IcechunkFormatError::FillValueParse { + data_type: dt.clone(), + value: value.clone(), + }), }) - .collect(), + .try_collect()?, )) } + _ => Err(IcechunkFormatError::FillValueParse { data_type: dt.clone(), value: value.clone(), @@ -218,7 +229,8 @@ impl FillValue { FillValue::Float64(_) => DataType::Float64, FillValue::Complex64(_, _) => DataType::Complex64, FillValue::Complex128(_, _) => DataType::Complex128, - FillValue::RawBits(v) => DataType::RawBits(v.len()), + FillValue::String(_) => DataType::String, + FillValue::Bytes(_) => DataType::Bytes, } } } diff --git a/icechunk/src/refs.rs b/icechunk/src/refs.rs index 68d8016f..bb52912a 100644 --- a/icechunk/src/refs.rs +++ b/icechunk/src/refs.rs @@ -8,13 +8,15 @@ use thiserror::Error; use crate::{format::SnapshotId, 
Storage, StorageError}; fn crock_encode_int(n: u64) -> String { - base32::encode(base32::Alphabet::Crockford, &n.to_be_bytes()) + // skip the first 3 bytes (zeroes) + base32::encode(base32::Alphabet::Crockford, &n.to_be_bytes()[3..=7]) } fn crock_decode_int(data: &str) -> Option { - let bytes = base32::decode(base32::Alphabet::Crockford, data)?; - let bytes = bytes.try_into().ok()?; - Some(u64::from_be_bytes(bytes)) + // re insert the first 3 bytes removed during encoding + let mut bytes = vec![0, 0, 0]; + bytes.extend(base32::decode(base32::Alphabet::Crockford, data)?); + Some(u64::from_be_bytes(bytes.as_slice().try_into().ok()?)) } #[derive(Debug, Error)] @@ -46,7 +48,7 @@ pub enum RefError { pub type RefResult = Result; -#[derive(Debug, Clone, Eq, PartialEq)] +#[derive(Debug, Clone, Eq, PartialEq, Hash)] pub enum Ref { Tag(String), Branch(String), @@ -56,9 +58,9 @@ impl Ref { pub const DEFAULT_BRANCH: &'static str = "main"; fn from_path(path: &str) -> RefResult { - match path.strip_prefix("tag:") { + match path.strip_prefix("tag.") { Some(name) => Ok(Ref::Tag(name.to_string())), - None => match path.strip_prefix("branch:") { + None => match path.strip_prefix("branch.") { Some(name) => Ok(Ref::Branch(name.to_string())), None => Err(RefError::InvalidRefType(path.to_string())), }, @@ -70,14 +72,16 @@ impl Ref { pub struct BranchVersion(pub u64); impl BranchVersion { + const MAX_VERSION_NUMBER: u64 = 1099511627775; + fn decode(version: &str) -> RefResult { let n = crock_decode_int(version) .ok_or(RefError::InvalidBranchVersion(version.to_string()))?; - Ok(BranchVersion(u64::MAX - n)) + Ok(BranchVersion(BranchVersion::MAX_VERSION_NUMBER - n)) } fn encode(&self) -> String { - crock_encode_int(u64::MAX - self.0) + crock_encode_int(BranchVersion::MAX_VERSION_NUMBER - self.0) } fn to_path(&self, branch_name: &str) -> RefResult { @@ -105,14 +109,14 @@ fn tag_key(tag_name: &str) -> RefResult { return Err(RefError::InvalidRefName(tag_name.to_string())); } - Ok(format!("tag:{}/{}", tag_name, TAG_KEY_NAME)) + Ok(format!("tag.{}/{}", tag_name, TAG_KEY_NAME)) } fn branch_root(branch_name: &str) -> RefResult { if branch_name.contains('/') { return Err(RefError::InvalidRefName(branch_name.to_string())); } - Ok(format!("branch:{}", branch_name)) + Ok(format!("branch.{}", branch_name)) } fn branch_key(branch_name: &str, version_id: &str) -> RefResult { @@ -196,7 +200,7 @@ async fn branch_history<'a, 'b>( branch: &'b str, ) -> RefResult> + 'a> { let key = branch_root(branch)?; - let all = storage.ref_versions(key.as_str()).await; + let all = storage.ref_versions(key.as_str()).await?; Ok(all.map_err(|e| e.into()).and_then(move |version_id| async move { let version = version_id .strip_suffix(".json") @@ -281,9 +285,24 @@ mod tests { #[tokio::test] async fn test_branch_version_encoding() -> Result<(), Box> { - let targets = (0..10u64).chain(once(u64::MAX)); + let targets = (0..10u64).chain(once(BranchVersion::MAX_VERSION_NUMBER)); + let encodings = [ + "ZZZZZZZZ", "ZZZZZZZY", "ZZZZZZZX", "ZZZZZZZW", "ZZZZZZZV", + // no U + "ZZZZZZZT", "ZZZZZZZS", "ZZZZZZZR", "ZZZZZZZQ", "ZZZZZZZP", + ]; + for n in targets { - let round = BranchVersion::decode(BranchVersion(n).encode().as_str())?; + let encoded = BranchVersion(n).encode(); + + if n < 100 { + assert_eq!(encoded, encodings[n as usize]); + } + if n == BranchVersion::MAX_VERSION_NUMBER { + assert_eq!(encoded, "00000000"); + } + + let round = BranchVersion::decode(encoded.as_str())?; assert_eq!(round, BranchVersion(n)); } Ok(()) @@ -291,7 +310,7 @@ mod tests { /// 
Execute the passed block with all test implementations of Storage. /// - /// Currently this function executes agains the in-memory and local filesystem object_store + /// Currently this function executes against the in-memory and local filesystem object_store /// implementations. async fn with_test_storages< R, diff --git a/icechunk/src/repository.rs b/icechunk/src/repository.rs index 86d55428..f44cb7e5 100644 --- a/icechunk/src/repository.rs +++ b/icechunk/src/repository.rs @@ -1,17 +1,12 @@ use std::{ - collections::{BTreeMap, HashMap, HashSet}, + collections::HashSet, iter::{self}, - mem::take, - path::PathBuf, pin::Pin, sync::Arc, }; -use crate::{ - format::{manifest::VirtualReferenceError, ManifestId, SnapshotId}, - storage::virtual_ref::{construct_valid_byte_range, VirtualChunkResolver}, -}; pub use crate::{ + change_set::ChangeSet, format::{ manifest::{ChunkPayload, VirtualChunkLocation}, snapshot::{SnapshotMetadata, ZarrArrayMetadata}, @@ -22,12 +17,21 @@ pub use crate::{ DimensionNames, FillValue, StorageTransformer, UserAttributes, }, }; +use crate::{ + format::{ + manifest::VirtualReferenceError, snapshot::ManifestFileInfo, ManifestId, + SnapshotId, + }, + storage::virtual_ref::{ + construct_valid_byte_range, ObjectStoreVirtualChunkResolverConfig, + VirtualChunkResolver, + }, +}; use bytes::Bytes; use chrono::Utc; use futures::{future::ready, Future, FutureExt, Stream, StreamExt, TryStreamExt}; use itertools::Either; use thiserror::Error; -use tokio::task; use crate::{ format::{ @@ -38,7 +42,7 @@ use crate::{ NodeData, NodeSnapshot, NodeType, Snapshot, SnapshotProperties, UserAttributesSnapshot, }, - ByteRange, Flags, IcechunkFormatError, NodeId, ObjectId, + ByteRange, IcechunkFormatError, NodeId, ObjectId, }, refs::{ create_tag, fetch_branch_tip, fetch_tag, update_branch, BranchVersion, Ref, @@ -51,17 +55,17 @@ use crate::{ #[derive(Clone, Debug)] pub struct RepositoryConfig { // Chunks smaller than this will be stored inline in the manifst - pub inline_threshold_bytes: u16, + pub inline_chunk_threshold_bytes: u16, // Unsafely overwrite refs on write. This is not recommended, users should only use it at their // own risk in object stores for which we don't support write-object-if-not-exists. There is - // teh posibility of race conditions if this variable is set to true and there are concurrent + // the possibility of race conditions if this variable is set to true and there are concurrent // commit attempts. 
pub unsafe_overwrite_refs: bool, } impl Default for RepositoryConfig { fn default() -> Self { - Self { inline_threshold_bytes: 512, unsafe_overwrite_refs: false } + Self { inline_chunk_threshold_bytes: 512, unsafe_overwrite_refs: false } } } @@ -75,155 +79,28 @@ pub struct Repository { virtual_resolver: Arc, } -#[derive(Clone, Debug, PartialEq, Default)] -struct ChangeSet { - new_groups: HashMap, - new_arrays: HashMap, - updated_arrays: HashMap, - // These paths may point to Arrays or Groups, - // since both Groups and Arrays support UserAttributes - updated_attributes: HashMap>, - // FIXME: issue with too many inline chunks kept in mem - set_chunks: HashMap>>, - deleted_groups: HashSet, - deleted_arrays: HashSet, -} - -impl ChangeSet { - fn is_empty(&self) -> bool { - self == &ChangeSet::default() - } - - fn add_group(&mut self, path: Path, node_id: NodeId) { - self.new_groups.insert(path, node_id); - } - - fn get_group(&self, path: &Path) -> Option<&NodeId> { - self.new_groups.get(path) - } - - fn get_array(&self, path: &Path) -> Option<&(NodeId, ZarrArrayMetadata)> { - self.new_arrays.get(path) - } - - fn delete_group(&mut self, path: &Path, node_id: NodeId) { - let new_node_id = self.new_groups.remove(path); - let is_new_group = new_node_id.is_some(); - debug_assert!(!is_new_group || new_node_id == Some(node_id)); - - self.updated_attributes.remove(&node_id); - if !is_new_group { - self.deleted_groups.insert(node_id); - } - } - - fn add_array(&mut self, path: Path, node_id: NodeId, metadata: ZarrArrayMetadata) { - self.new_arrays.insert(path, (node_id, metadata)); - } - - fn update_array(&mut self, node_id: NodeId, metadata: ZarrArrayMetadata) { - self.updated_arrays.insert(node_id, metadata); - } - - fn delete_array(&mut self, path: &Path, node_id: NodeId) { - // if deleting a new array created in this session, just remove the entry - // from new_arrays - let node_and_meta = self.new_arrays.remove(path); - let is_new_array = node_and_meta.is_some(); - debug_assert!(!is_new_array || node_and_meta.map(|n| n.0) == Some(node_id)); - - self.updated_arrays.remove(&node_id); - self.updated_attributes.remove(&node_id); - self.set_chunks.remove(&node_id); - if !is_new_array { - self.deleted_arrays.insert(node_id); - } - } - - fn get_updated_zarr_metadata(&self, node_id: NodeId) -> Option<&ZarrArrayMetadata> { - self.updated_arrays.get(&node_id) - } - - fn update_user_attributes(&mut self, node_id: NodeId, atts: Option) { - self.updated_attributes.insert(node_id, atts); - } - - fn get_user_attributes(&self, node_id: NodeId) -> Option<&Option> { - self.updated_attributes.get(&node_id) - } - - fn set_chunk_ref( - &mut self, - node_id: NodeId, - coord: ChunkIndices, - data: Option, - ) { - // this implementation makes delete idempotent - // it allows deleting a deleted chunk by repeatedly setting None. 
- self.set_chunks - .entry(node_id) - .and_modify(|h| { - h.insert(coord.clone(), data.clone()); - }) - .or_insert(HashMap::from([(coord, data)])); - } - - fn get_chunk_ref( - &self, - node_id: NodeId, - coords: &ChunkIndices, - ) -> Option<&Option> { - self.set_chunks.get(&node_id).and_then(|h| h.get(coords)) - } - - fn array_chunks_iterator( - &self, - node_id: NodeId, - ) -> impl Iterator)> { - match self.set_chunks.get(&node_id) { - None => Either::Left(iter::empty()), - Some(h) => Either::Right(h.iter()), - } - } - - fn new_arrays_chunk_iterator( - &self, - ) -> impl Iterator + '_ { - self.new_arrays.iter().flat_map(|(path, (node_id, _))| { - self.array_chunks_iterator(*node_id).filter_map(|(coords, payload)| { - payload.as_ref().map(|p| { - ( - path.clone(), - ChunkInfo { - node: *node_id, - coord: coords.clone(), - payload: p.clone(), - }, - ) - }) - }) - }) - } - - fn new_nodes(&self) -> impl Iterator { - self.new_groups.keys().chain(self.new_arrays.keys()) - } -} - #[derive(Debug, Clone)] pub struct RepositoryBuilder { config: RepositoryConfig, storage: Arc, snapshot_id: SnapshotId, + change_set: Option, + virtual_ref_config: Option, } impl RepositoryBuilder { fn new(storage: Arc, snapshot_id: SnapshotId) -> Self { - Self { config: RepositoryConfig::default(), snapshot_id, storage } + Self { + config: RepositoryConfig::default(), + snapshot_id, + storage, + change_set: None, + virtual_ref_config: None, + } } pub fn with_inline_threshold_bytes(&mut self, threshold: u16) -> &mut Self { - self.config.inline_threshold_bytes = threshold; + self.config.inline_chunk_threshold_bytes = threshold; self } @@ -237,18 +114,34 @@ impl RepositoryBuilder { self } + pub fn with_virtual_ref_config( + &mut self, + config: ObjectStoreVirtualChunkResolverConfig, + ) -> &mut Self { + self.virtual_ref_config = Some(config); + self + } + + pub fn with_change_set(&mut self, change_set_bytes: ChangeSet) -> &mut Self { + self.change_set = Some(change_set_bytes); + self + } + pub fn build(&self) -> Repository { Repository::new( self.config.clone(), self.storage.clone(), self.snapshot_id.clone(), + self.change_set.clone(), + self.virtual_ref_config.clone(), ) } } #[derive(Debug, Error)] +#[non_exhaustive] pub enum RepositoryError { - #[error("error contacting storage")] + #[error("error contacting storage {0}")] StorageError(#[from] StorageError), #[error("error in icechunk file")] FormatError(#[from] IcechunkFormatError), @@ -274,9 +167,13 @@ pub enum RepositoryError { AlreadyInitialized, #[error("error when handling virtual reference {0}")] VirtualReferenceError(#[from] VirtualReferenceError), + #[error("error in repository serialization `{0}`")] + SerializationError(#[from] rmp_serde::encode::Error), + #[error("error in repository deserialization `{0}`")] + DeserializationError(#[from] rmp_serde::decode::Error), } -type RepositoryResult = Result; +pub type RepositoryResult = Result; /// FIXME: what do we want to do with implicit groups? 
/// @@ -353,17 +250,47 @@ impl Repository { config: RepositoryConfig, storage: Arc, snapshot_id: SnapshotId, + change_set: Option, + virtual_ref_config: Option, ) -> Self { Repository { snapshot_id, config, storage, last_node_id: None, - change_set: ChangeSet::default(), - virtual_resolver: Arc::new(ObjectStoreVirtualChunkResolver::default()), + change_set: change_set.unwrap_or_default(), + virtual_resolver: Arc::new(ObjectStoreVirtualChunkResolver::new( + virtual_ref_config, + )), } } + pub fn config(&self) -> &RepositoryConfig { + &self.config + } + + pub(crate) fn set_snapshot_id(&mut self, snapshot_id: SnapshotId) { + self.snapshot_id = snapshot_id; + } + + pub(crate) async fn set_snapshot_from_tag( + &mut self, + tag: &str, + ) -> RepositoryResult<()> { + let ref_data = fetch_tag(self.storage.as_ref(), tag).await?; + self.snapshot_id = ref_data.snapshot; + Ok(()) + } + + pub(crate) async fn set_snapshot_from_branch( + &mut self, + branch: &str, + ) -> RepositoryResult<()> { + let ref_data = fetch_branch_tip(self.storage.as_ref(), branch).await?; + self.snapshot_id = ref_data.snapshot; + Ok(()) + } + /// Returns a pointer to the storage for the repository pub fn storage(&self) -> &Arc { &self.storage @@ -414,10 +341,18 @@ impl Repository { } } + /// Delete a group in the hierarchy + /// + /// Deletes of non existing groups will succeed. pub async fn delete_group(&mut self, path: Path) -> RepositoryResult<()> { - self.get_group(&path) - .await - .map(|node| self.change_set.delete_group(&node.path, node.id)) + match self.get_group(&path).await { + Ok(node) => { + self.change_set.delete_group(node.path, node.id); + } + Err(RepositoryError::NodeNotFound { .. }) => {} + Err(err) => Err(err)?, + } + Ok(()) } /// Add an array to the store. @@ -455,10 +390,18 @@ impl Repository { .map(|node| self.change_set.update_array(node.id, metadata)) } + /// Delete an array in the hierarchy + /// + /// Deletes of non existing array will succeed. pub async fn delete_array(&mut self, path: Path) -> RepositoryResult<()> { - self.get_array(&path) - .await - .map(|node| self.change_set.delete_array(&node.path, node.id)) + match self.get_array(&path).await { + Ok(node) => { + self.change_set.delete_array(node.path, node.id); + } + Err(RepositoryError::NodeNotFound { .. 
}) => {} + Err(err) => Err(err)?, + } + Ok(()) } /// Record the write or delete of user attributes to array or group @@ -504,26 +447,8 @@ impl Repository { Ok(new) } - // FIXME: add moves - pub async fn get_node(&self, path: &Path) -> RepositoryResult { - // We need to look for nodes in self.change_set and the snapshot file - match self.get_new_node(path) { - Some(node) => Ok(node), - None => { - let node = self.get_existing_node(path).await?; - if self.change_set.deleted_groups.contains(&node.id) - || self.change_set.deleted_arrays.contains(&node.id) - { - Err(RepositoryError::NodeNotFound { - path: path.clone(), - message: "getting node".to_string(), - }) - } else { - Ok(node) - } - } - } + get_node(self.storage.as_ref(), &self.change_set, self.snapshot_id(), path).await } pub async fn get_array(&self, path: &Path) -> RepositoryResult { @@ -548,77 +473,6 @@ impl Repository { } } - async fn get_existing_node(&self, path: &Path) -> RepositoryResult { - // An existing node is one that is present in a Snapshot file on storage - let snapshot_id = &self.snapshot_id; - let snapshot = self.storage.fetch_snapshot(snapshot_id).await?; - - let node = snapshot.get_node(path).map_err(|err| match err { - // A missing node here is not really a format error, so we need to - // generate the correct error for repositories - IcechunkFormatError::NodeNotFound { path } => RepositoryError::NodeNotFound { - path, - message: "existing node not found".to_string(), - }, - err => RepositoryError::FormatError(err), - })?; - let session_atts = self - .change_set - .get_user_attributes(node.id) - .cloned() - .map(|a| a.map(UserAttributesSnapshot::Inline)); - let res = NodeSnapshot { - user_attributes: session_atts.unwrap_or_else(|| node.user_attributes.clone()), - ..node.clone() - }; - if let Some(session_meta) = - self.change_set.get_updated_zarr_metadata(node.id).cloned() - { - if let NodeData::Array(_, manifests) = res.node_data { - Ok(NodeSnapshot { - node_data: NodeData::Array(session_meta, manifests), - ..res - }) - } else { - Ok(res) - } - } else { - Ok(res) - } - } - - fn get_new_node(&self, path: &Path) -> Option { - self.get_new_array(path).or(self.get_new_group(path)) - } - - fn get_new_array(&self, path: &Path) -> Option { - self.change_set.get_array(path).map(|(id, meta)| { - let meta = - self.change_set.get_updated_zarr_metadata(*id).unwrap_or(meta).clone(); - let atts = self.change_set.get_user_attributes(*id).cloned(); - NodeSnapshot { - id: *id, - path: path.clone(), - user_attributes: atts.flatten().map(UserAttributesSnapshot::Inline), - // We put no manifests in new arrays, see get_chunk_ref to understand how chunks get - // fetched for those arrays - node_data: NodeData::Array(meta.clone(), vec![]), - } - }) - } - - fn get_new_group(&self, path: &Path) -> Option { - self.change_set.get_group(path).map(|id| { - let atts = self.change_set.get_user_attributes(*id).cloned(); - NodeSnapshot { - id: *id, - path: path.clone(), - user_attributes: atts.flatten().map(UserAttributesSnapshot::Inline), - node_data: NodeData::Group, - } - }) - } - pub async fn get_chunk_ref( &self, path: &Path, @@ -637,6 +491,7 @@ impl Repository { // TODO: I hate rust forces me to clone to search in a hashmap. How to do better? 
let session_chunk = self.change_set.get_chunk_ref(node.id, coords).cloned(); + // If session_chunk is not None we have to return it, because is the update the // user made in the current session // If session_chunk == None, user hasn't modified the chunk in this session and we @@ -735,7 +590,7 @@ impl Repository { ) -> Pin< Box> + Send>, > { - let threshold = self.config.inline_threshold_bytes as usize; + let threshold = self.config.inline_chunk_threshold_bytes as usize; let storage = Arc::clone(&self.storage); move |data: Bytes| { async move { @@ -750,6 +605,19 @@ impl Repository { } } + pub async fn clear(&mut self) -> RepositoryResult<()> { + let to_delete: Vec<(NodeType, Path)> = + self.list_nodes().await?.map(|node| (node.node_type(), node.path)).collect(); + + for (t, p) in to_delete { + match t { + NodeType::Group => self.delete_group(p).await?, + NodeType::Array => self.delete_array(p).await?, + } + } + Ok(()) + } + async fn get_old_chunk( &self, node: NodeId, @@ -771,219 +639,40 @@ impl Repository { Ok(None) } - /// Warning: The presence of a single error may mean multiple missing items - async fn updated_chunk_iterator( - &self, - ) -> RepositoryResult> + '_> - { - let snapshot = self.storage.fetch_snapshot(&self.snapshot_id).await?; - let nodes = futures::stream::iter(snapshot.iter_arc()); - let res = nodes.then(move |node| async move { - let path = node.path.clone(); - self.node_chunk_iterator(node).await.map_ok(move |ci| (path.clone(), ci)) - }); - Ok(res.flatten()) - } - - /// Warning: The presence of a single error may mean multiple missing items - async fn node_chunk_iterator( - &self, - node: NodeSnapshot, - ) -> impl Stream> + '_ { - match node.node_data { - NodeData::Group => futures::future::Either::Left(futures::stream::empty()), - NodeData::Array(_, manifests) => { - let new_chunk_indices: Box> = Box::new( - self.change_set - .array_chunks_iterator(node.id) - .map(|(idx, _)| idx) - .collect(), - ); - - let new_chunks = self - .change_set - .array_chunks_iterator(node.id) - .filter_map(move |(idx, payload)| { - payload.as_ref().map(|payload| { - Ok(ChunkInfo { - node: node.id, - coord: idx.clone(), - payload: payload.clone(), - }) - }) - }); - - futures::future::Either::Right( - futures::stream::iter(new_chunks).chain( - futures::stream::iter(manifests) - .then(move |manifest_ref| { - let new_chunk_indices = new_chunk_indices.clone(); - async move { - let manifest = self - .storage - .fetch_manifests(&manifest_ref.object_id) - .await; - match manifest { - Ok(manifest) => { - let old_chunks = manifest - .iter(&node.id) - .filter(move |(coord, _)| { - !new_chunk_indices.contains(coord) - }) - .map(move |(coord, payload)| ChunkInfo { - node: node.id, - coord, - payload, - }); - - let old_chunks = self.update_existing_chunks( - node.id, old_chunks, - ); - futures::future::Either::Left( - futures::stream::iter(old_chunks.map(Ok)), - ) - } - // if we cannot even fetch the manifest, we generate a - // single error value. 
- Err(err) => futures::future::Either::Right( - futures::stream::once(ready(Err( - RepositoryError::StorageError(err), - ))), - ), - } - } - }) - .flatten(), - ), - ) - } - } - } - - fn update_existing_chunks<'a>( - &'a self, - node: NodeId, - chunks: impl Iterator + 'a, - ) -> impl Iterator + 'a { - chunks.filter_map(move |chunk| { - match self.change_set.get_chunk_ref(node, &chunk.coord) { - None => Some(chunk), - Some(new_payload) => { - new_payload.clone().map(|pl| ChunkInfo { payload: pl, ..chunk }) - } - } - }) - } - - async fn updated_existing_nodes<'a>( - &'a self, - manifest_id: &'a ManifestId, - ) -> RepositoryResult + 'a> { - // TODO: solve this duplication, there is always the possibility of this being the first - // version - let updated_nodes = - self.storage.fetch_snapshot(&self.snapshot_id).await?.iter_arc().map( - move |node| { - let new_manifests = if node.node_type() == NodeType::Array { - //FIXME: it could be none for empty arrays - Some(vec![ManifestRef { - object_id: manifest_id.clone(), - flags: Flags(), - extents: ManifestExtents(vec![]), - }]) - } else { - None - }; - self.update_existing_node(node, new_manifests) - }, - ); - - Ok(updated_nodes) - } - - fn new_nodes<'a>( - &'a self, - manifest_id: &'a ManifestId, - ) -> impl Iterator + 'a { - self.change_set.new_nodes().map(move |path| { - // we should be able to create the full node because we - // know it's a new node - #[allow(clippy::expect_used)] - let node = self.get_new_node(path).expect("Bug in new_nodes implementation"); - match node.node_data { - NodeData::Group => node, - NodeData::Array(meta, _no_manifests_yet) => { - let new_manifests = vec![ManifestRef { - object_id: manifest_id.clone(), - flags: Flags(), - extents: ManifestExtents(vec![]), - }]; - NodeSnapshot { - node_data: NodeData::Array(meta, new_manifests), - ..node - } - } - } - }) - } - - async fn updated_nodes<'a>( - &'a self, - manifest_id: &'a ManifestId, - ) -> RepositoryResult + 'a> { - Ok(self - .updated_existing_nodes(manifest_id) - .await? 
- .chain(self.new_nodes(manifest_id))) - } - - fn update_existing_node( - &self, - node: NodeSnapshot, - new_manifests: Option>, - ) -> NodeSnapshot { - let session_atts = self - .change_set - .get_user_attributes(node.id) - .cloned() - .map(|a| a.map(UserAttributesSnapshot::Inline)); - let new_atts = session_atts.unwrap_or(node.user_attributes); - match node.node_data { - NodeData::Group => NodeSnapshot { user_attributes: new_atts, ..node }, - NodeData::Array(old_zarr_meta, _) => { - let new_zarr_meta = self - .change_set - .get_updated_zarr_metadata(node.id) - .cloned() - .unwrap_or(old_zarr_meta); - - NodeSnapshot { - // FIXME: bad option type, change - node_data: NodeData::Array( - new_zarr_meta, - new_manifests.unwrap_or_default(), - ), - user_attributes: new_atts, - ..node - } - } - } - } - pub async fn list_nodes( &self, ) -> RepositoryResult + '_> { - self.updated_nodes(&ObjectId::FAKE).await + updated_nodes(self.storage.as_ref(), &self.change_set, &self.snapshot_id, None) + .await } pub async fn all_chunks( &self, - ) -> RepositoryResult> + '_> + ) -> RepositoryResult> + '_> { - let existing_array_chunks = self.updated_chunk_iterator().await?; - let new_array_chunks = - futures::stream::iter(self.change_set.new_arrays_chunk_iterator().map(Ok)); - Ok(existing_array_chunks.chain(new_array_chunks)) + all_chunks(self.storage.as_ref(), &self.change_set, self.snapshot_id()).await + } + + pub async fn distributed_flush>( + &mut self, + other_change_sets: I, + message: &str, + properties: SnapshotProperties, + ) -> RepositoryResult { + // FIXME: this clone can be avoided + let change_sets = iter::once(self.change_set.clone()).chain(other_change_sets); + let new_snapshot_id = distributed_flush( + self.storage.as_ref(), + change_sets, + self.snapshot_id(), + message, + properties, + ) + .await?; + + self.snapshot_id = new_snapshot_id.clone(); + self.change_set = ChangeSet::default(); + Ok(new_snapshot_id) } /// After changes to the repository have been made, this generates and writes to `Storage` the updated datastructures. @@ -998,89 +687,7 @@ impl Repository { message: &str, properties: SnapshotProperties, ) -> RepositoryResult { - if !self.has_uncommitted_changes() { - return Err(RepositoryError::NoChangesToCommit); - } - // We search for the current manifest. We are assumming a single one for now - let old_snapshot = self.storage().fetch_snapshot(&self.snapshot_id).await?; - let old_snapshot_c = Arc::clone(&old_snapshot); - let manifest_id = old_snapshot_c.iter_arc().find_map(|node| { - match node.node_data { - NodeData::Array(_, man) => { - // TODO: can we avoid clone - man.first().map(|manifest| manifest.object_id.clone()) - } - NodeData::Group => None, - } - }); - - let old_manifest = match manifest_id { - Some(ref manifest_id) => self.storage.fetch_manifests(manifest_id).await?, - // If there is no previous manifest we create an empty one - None => Arc::new(Manifest::default()), - }; - - // The manifest update process is CPU intensive, so we want to executed it on a worker - // thread. Currently it's also destructive of the manifest, so we are also cloning the - // old manifest data - // - // The update process requires reference access to the set_chunks map, since we are running - // it on blocking task, it wants that reference to be 'static, which we cannot provide. - // As a solution, we temporarily `take` the map, replacing it an empty one, run the thread, - // and at the end we put the map back to where it was, in case there is some later failure. 
- // We always want to leave things in the previous state if there was a failure. - - let chunk_changes = Arc::new(take(&mut self.change_set.set_chunks)); - let chunk_changes_c = Arc::clone(&chunk_changes); - - let update_task = task::spawn_blocking(move || { - //FIXME: avoid clone, this one is extremely expensive en memory - //it's currently needed because we don't want to destroy the manifest in case of later - //failure - let mut new_chunks = old_manifest.as_ref().chunks.clone(); - update_manifest(&mut new_chunks, &chunk_changes_c); - (new_chunks, chunk_changes) - }); - - match update_task.await { - Ok((new_chunks, chunk_changes)) => { - // reset the set_chunks map to it's previous value - #[allow(clippy::expect_used)] - { - // It's OK to call into_inner here because we created the Arc locally and never - // shared it with other code - self.change_set.set_chunks = - Arc::into_inner(chunk_changes).expect("Bug in flush task join"); - } - - let new_manifest = Arc::new(Manifest { chunks: new_chunks }); - let new_manifest_id = ObjectId::random(); - self.storage - .write_manifests(new_manifest_id.clone(), new_manifest) - .await?; - - let all_nodes = self.updated_nodes(&new_manifest_id).await?; - - let mut new_snapshot = Snapshot::child_from_iter( - old_snapshot.as_ref(), - Some(properties), - all_nodes, - ); - new_snapshot.metadata.message = message.to_string(); - new_snapshot.metadata.written_at = Utc::now(); - - let new_snapshot = Arc::new(new_snapshot); - let new_snapshot_id = &new_snapshot.metadata.id; - self.storage - .write_snapshot(new_snapshot_id.clone(), Arc::clone(&new_snapshot)) - .await?; - - self.snapshot_id = new_snapshot_id.clone(); - self.change_set = ChangeSet::default(); - Ok(new_snapshot_id.clone()) - } - Err(_) => Err(RepositoryError::OtherFlushError), - } + self.distributed_flush(iter::empty(), message, properties).await } pub async fn commit( @@ -1088,11 +695,28 @@ impl Repository { update_branch_name: &str, message: &str, properties: Option, + ) -> RepositoryResult { + self.distributed_commit(update_branch_name, iter::empty(), message, properties) + .await + } + + pub async fn distributed_commit>( + &mut self, + update_branch_name: &str, + other_change_sets: I, + message: &str, + properties: Option, ) -> RepositoryResult { let current = fetch_branch_tip(self.storage.as_ref(), update_branch_name).await; match current { Err(RefError::RefNotFound(_)) => { - self.do_commit(update_branch_name, message, properties).await + self.do_distributed_commit( + update_branch_name, + other_change_sets, + message, + properties, + ) + .await } Err(err) => Err(err.into()), Ok(ref_data) => { @@ -1103,21 +727,29 @@ impl Repository { actual_parent: Some(ref_data.snapshot.clone()), }) } else { - self.do_commit(update_branch_name, message, properties).await + self.do_distributed_commit( + update_branch_name, + other_change_sets, + message, + properties, + ) + .await } } } } - async fn do_commit( + async fn do_distributed_commit>( &mut self, update_branch_name: &str, + other_change_sets: I, message: &str, properties: Option, ) -> RepositoryResult { let parent_snapshot = self.snapshot_id.clone(); let properties = properties.unwrap_or_default(); - let new_snapshot = self.flush(message, properties).await?; + let new_snapshot = + self.distributed_flush(other_change_sets, message, properties).await?; match update_branch( self.storage.as_ref(), @@ -1136,6 +768,10 @@ impl Repository { } } + pub fn change_set_bytes(&self) -> RepositoryResult> { + self.change_set.export_to_bytes() + } + pub async fn 
new_branch(&self, branch_name: &str) -> RepositoryResult { // TODO: The parent snapshot should exist? let version = match update_branch( @@ -1173,6 +809,12 @@ impl Repository { } } +impl From for ChangeSet { + fn from(val: Repository) -> Self { + val.change_set + } +} + async fn new_materialized_chunk( storage: &(dyn Storage + Send + Sync), data: Bytes, @@ -1186,26 +828,6 @@ fn new_inline_chunk(data: Bytes) -> ChunkPayload { ChunkPayload::Inline(data) } -fn update_manifest( - original_chunks: &mut BTreeMap<(NodeId, ChunkIndices), ChunkPayload>, - set_chunks: &HashMap>>, -) { - for (node_id, chunks) in set_chunks.iter() { - for (coord, maybe_payload) in chunks.iter() { - match maybe_payload { - Some(payload) => { - // a chunk was updated or inserted - original_chunks.insert((*node_id, coord.clone()), payload.clone()); - } - None => { - // a chunk was deleted - original_chunks.remove(&(*node_id, coord.clone())); - } - } - } - } -} - pub async fn get_chunk( reader: Option> + Send>>>, ) -> RepositoryResult> { @@ -1215,11 +837,286 @@ pub async fn get_chunk( } } +async fn updated_existing_nodes<'a>( + storage: &(dyn Storage + Send + Sync), + change_set: &'a ChangeSet, + parent_id: &SnapshotId, + manifest_id: Option<&'a ManifestId>, +) -> RepositoryResult + 'a> { + let manifest_refs = manifest_id.map(|mid| { + vec![ManifestRef { object_id: mid.clone(), extents: ManifestExtents(vec![]) }] + }); + let updated_nodes = + storage.fetch_snapshot(parent_id).await?.iter_arc().filter_map(move |node| { + let new_manifests = if node.node_type() == NodeType::Array { + //FIXME: it could be none for empty arrays + manifest_refs.clone() + } else { + None + }; + change_set.update_existing_node(node, new_manifests) + }); + + Ok(updated_nodes) +} + +async fn updated_nodes<'a>( + storage: &(dyn Storage + Send + Sync), + change_set: &'a ChangeSet, + parent_id: &SnapshotId, + manifest_id: Option<&'a ManifestId>, +) -> RepositoryResult + 'a> { + Ok(updated_existing_nodes(storage, change_set, parent_id, manifest_id) + .await? 
+ .chain(change_set.new_nodes_iterator(manifest_id))) +} + +async fn get_node<'a>( + storage: &(dyn Storage + Send + Sync), + change_set: &'a ChangeSet, + snapshot_id: &SnapshotId, + path: &Path, +) -> RepositoryResult { + // We need to look for nodes in self.change_set and the snapshot file + if change_set.is_deleted(path) { + return Err(RepositoryError::NodeNotFound { + path: path.clone(), + message: "getting node".to_string(), + }); + } + match change_set.get_new_node(path) { + Some(node) => Ok(node), + None => { + let node = get_existing_node(storage, change_set, snapshot_id, path).await?; + if change_set.is_deleted(&node.path) { + Err(RepositoryError::NodeNotFound { + path: path.clone(), + message: "getting node".to_string(), + }) + } else { + Ok(node) + } + } + } +} + +async fn get_existing_node<'a>( + storage: &(dyn Storage + Send + Sync), + change_set: &'a ChangeSet, + snapshot_id: &SnapshotId, + path: &Path, +) -> RepositoryResult { + // An existing node is one that is present in a Snapshot file on storage + let snapshot = storage.fetch_snapshot(snapshot_id).await?; + + let node = snapshot.get_node(path).map_err(|err| match err { + // A missing node here is not really a format error, so we need to + // generate the correct error for repositories + IcechunkFormatError::NodeNotFound { path } => RepositoryError::NodeNotFound { + path, + message: "existing node not found".to_string(), + }, + err => RepositoryError::FormatError(err), + })?; + let session_atts = change_set + .get_user_attributes(node.id) + .cloned() + .map(|a| a.map(UserAttributesSnapshot::Inline)); + let res = NodeSnapshot { + user_attributes: session_atts.unwrap_or_else(|| node.user_attributes.clone()), + ..node.clone() + }; + if let Some(session_meta) = change_set.get_updated_zarr_metadata(node.id).cloned() { + if let NodeData::Array(_, manifests) = res.node_data { + Ok(NodeSnapshot { + node_data: NodeData::Array(session_meta, manifests), + ..res + }) + } else { + Ok(res) + } + } else { + Ok(res) + } +} + +async fn distributed_flush>( + storage: &(dyn Storage + Send + Sync), + change_sets: I, + parent_id: &SnapshotId, + message: &str, + properties: SnapshotProperties, +) -> RepositoryResult { + let mut change_set = ChangeSet::default(); + change_set.merge_many(change_sets); + if change_set.is_empty() { + return Err(RepositoryError::NoChangesToCommit); + } + + let chunks = all_chunks(storage, &change_set, parent_id) + .await? 
+ .map_ok(|(_path, chunk_info)| chunk_info); + + let new_manifest = Arc::new(Manifest::from_stream(chunks).await?); + let new_manifest_id = if new_manifest.len() > 0 { + let id = ObjectId::random(); + storage.write_manifests(id.clone(), Arc::clone(&new_manifest)).await?; + Some(id) + } else { + None + }; + + let all_nodes = + updated_nodes(storage, &change_set, parent_id, new_manifest_id.as_ref()).await?; + + let old_snapshot = storage.fetch_snapshot(parent_id).await?; + let mut new_snapshot = Snapshot::from_iter( + old_snapshot.as_ref(), + Some(properties), + new_manifest_id + .as_ref() + .map(|mid| { + vec![ManifestFileInfo { + id: mid.clone(), + format_version: new_manifest.icechunk_manifest_format_version, + }] + }) + .unwrap_or_default(), + vec![], + all_nodes, + ); + new_snapshot.metadata.message = message.to_string(); + new_snapshot.metadata.written_at = Utc::now(); + + let new_snapshot = Arc::new(new_snapshot); + let new_snapshot_id = &new_snapshot.metadata.id; + storage.write_snapshot(new_snapshot_id.clone(), Arc::clone(&new_snapshot)).await?; + + Ok(new_snapshot_id.clone()) +} + +/// Warning: The presence of a single error may mean multiple missing items +async fn updated_chunk_iterator<'a>( + storage: &'a (dyn Storage + Send + Sync), + change_set: &'a ChangeSet, + snapshot_id: &'a SnapshotId, +) -> RepositoryResult> + 'a> { + let snapshot = storage.fetch_snapshot(snapshot_id).await?; + let nodes = futures::stream::iter(snapshot.iter_arc()); + let res = nodes.then(move |node| async move { + let path = node.path.clone(); + node_chunk_iterator(storage, change_set, snapshot_id, &node.path) + .await + .map_ok(move |ci| (path.clone(), ci)) + }); + Ok(res.flatten()) +} + +/// Warning: The presence of a single error may mean multiple missing items +async fn node_chunk_iterator<'a>( + storage: &'a (dyn Storage + Send + Sync), + change_set: &'a ChangeSet, + snapshot_id: &SnapshotId, + path: &Path, +) -> impl Stream> + 'a { + match get_node(storage, change_set, snapshot_id, path).await { + Ok(node) => futures::future::Either::Left( + verified_node_chunk_iterator(storage, change_set, node).await, + ), + Err(_) => futures::future::Either::Right(futures::stream::empty()), + } +} + +/// Warning: The presence of a single error may mean multiple missing items +async fn verified_node_chunk_iterator<'a>( + storage: &'a (dyn Storage + Send + Sync), + change_set: &'a ChangeSet, + node: NodeSnapshot, +) -> impl Stream> + 'a { + match node.node_data { + NodeData::Group => futures::future::Either::Left(futures::stream::empty()), + NodeData::Array(_, manifests) => { + let new_chunk_indices: Box> = Box::new( + change_set + .array_chunks_iterator(node.id, &node.path) + .map(|(idx, _)| idx) + .collect(), + ); + + let new_chunks = change_set + .array_chunks_iterator(node.id, &node.path) + .filter_map(move |(idx, payload)| { + payload.as_ref().map(|payload| { + Ok(ChunkInfo { + node: node.id, + coord: idx.clone(), + payload: payload.clone(), + }) + }) + }); + + futures::future::Either::Right( + futures::stream::iter(new_chunks).chain( + futures::stream::iter(manifests) + .then(move |manifest_ref| { + let new_chunk_indices = new_chunk_indices.clone(); + async move { + let manifest = storage + .fetch_manifests(&manifest_ref.object_id) + .await; + match manifest { + Ok(manifest) => { + let old_chunks = manifest + .iter(&node.id) + .filter(move |(coord, _)| { + !new_chunk_indices.contains(coord) + }) + .map(move |(coord, payload)| ChunkInfo { + node: node.id, + coord, + payload, + }); + + let old_chunks = 
change_set + .update_existing_chunks(node.id, old_chunks); + futures::future::Either::Left( + futures::stream::iter(old_chunks.map(Ok)), + ) + } + // if we cannot even fetch the manifest, we generate a + // single error value. + Err(err) => futures::future::Either::Right( + futures::stream::once(ready(Err( + RepositoryError::StorageError(err), + ))), + ), + } + } + }) + .flatten(), + ), + ) + } + } +} + +async fn all_chunks<'a>( + storage: &'a (dyn Storage + Send + Sync), + change_set: &'a ChangeSet, + snapshot_id: &'a SnapshotId, +) -> RepositoryResult> + 'a> { + let existing_array_chunks = + updated_chunk_iterator(storage, change_set, snapshot_id).await?; + let new_array_chunks = + futures::stream::iter(change_set.new_arrays_chunk_iterator().map(Ok)); + Ok(existing_array_chunks.chain(new_array_chunks)) +} + #[cfg(test)] #[allow(clippy::panic, clippy::unwrap_used, clippy::expect_used)] mod tests { - use std::{error::Error, num::NonZeroU64, path::PathBuf}; + use std::{error::Error, num::NonZeroU64}; use crate::{ format::manifest::ChunkInfo, @@ -1266,12 +1163,8 @@ mod tests { // deleting the added group must succeed prop_assert!(repository.delete_group(path.clone()).await.is_ok()); - // deleting twice must fail - let matches = matches!( - repository.delete_group(path.clone()).await.unwrap_err(), - RepositoryError::NodeNotFound{path: reported_path, ..} if reported_path == path - ); - prop_assert!(matches); + // deleting twice must succeed + prop_assert!(repository.delete_group(path.clone()).await.is_ok()); // getting a deleted group must fail prop_assert!(repository.get_node(&path).await.is_err()); @@ -1298,12 +1191,8 @@ mod tests { // first delete must succeed prop_assert!(repository.delete_array(path.clone()).await.is_ok()); - // deleting twice must fail - let matches = matches!( - repository.delete_array(path.clone()).await.unwrap_err(), - RepositoryError::NodeNotFound{path: reported_path, ..} if reported_path == path - ); - prop_assert!(matches); + // deleting twice must succeed + prop_assert!(repository.delete_array(path.clone()).await.is_ok()); // adding again must succeed prop_assert!(repository.add_array(path.clone(), metadata.clone()).await.is_ok()); @@ -1372,7 +1261,7 @@ mod tests { let manifest = Arc::new(vec![chunk1.clone(), chunk2.clone()].into_iter().collect()); let manifest_id = ObjectId::random(); - storage.write_manifests(manifest_id.clone(), manifest).await?; + storage.write_manifests(manifest_id.clone(), Arc::clone(&manifest)).await?; let zarr_meta1 = ZarrArrayMetadata { shape: vec![2, 2, 2], @@ -1396,14 +1285,13 @@ mod tests { ]), }; let manifest_ref = ManifestRef { - object_id: manifest_id, - flags: Flags(), + object_id: manifest_id.clone(), extents: ManifestExtents(vec![]), }; - let array1_path: PathBuf = "/array1".to_string().into(); + let array1_path: Path = "/array1".try_into().unwrap(); let nodes = vec![ NodeSnapshot { - path: "/".into(), + path: Path::root(), id: 1, user_attributes: None, node_data: NodeData::Group, @@ -1418,7 +1306,18 @@ mod tests { }, ]; - let snapshot = Arc::new(Snapshot::first_from_iter(None, nodes.iter().cloned())); + let initial = Snapshot::empty(); + let manifests = vec![ManifestFileInfo { + id: manifest_id.clone(), + format_version: manifest.icechunk_manifest_format_version, + }]; + let snapshot = Arc::new(Snapshot::from_iter( + &initial, + None, + manifests, + vec![], + nodes.iter().cloned(), + )); let snapshot_id = ObjectId::random(); storage.write_snapshot(snapshot_id.clone(), snapshot).await?; let mut ds = 
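// Illustrative sketch, not part of the patch: the chunk-merging rule that
// `verified_node_chunk_iterator` and `all_chunks` above implement as streams, condensed
// into plain collections. Chunks written in the current session shadow same-coordinate
// chunks from the stored manifest, and chunks deleted in the session are dropped.
// Coordinate and payload types are simplified stand-ins.
use std::collections::{HashMap, HashSet};

fn merge_chunks(
    manifest_chunks: HashMap<Vec<u32>, String>,        // coord -> payload from the old manifest
    session_chunks: HashMap<Vec<u32>, Option<String>>, // coord -> Some(new payload) or None (deleted)
) -> HashMap<Vec<u32>, String> {
    let touched: HashSet<Vec<u32>> = session_chunks.keys().cloned().collect();
    let mut merged: HashMap<Vec<u32>, String> = manifest_chunks
        .into_iter()
        .filter(|(coord, _)| !touched.contains(coord))
        .collect();
    for (coord, maybe_payload) in session_chunks {
        if let Some(payload) = maybe_payload {
            merged.insert(coord, payload);
        }
    }
    merged
}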
Repository::update(Arc::new(storage), snapshot_id) @@ -1430,13 +1329,14 @@ mod tests { assert_eq!(nodes.get(1).unwrap(), &node); let group_name = "/tbd-group".to_string(); - ds.add_group(group_name.clone().into()).await?; - ds.delete_group(group_name.clone().into()).await?; - assert!(ds.delete_group(group_name.clone().into()).await.is_err()); - assert!(ds.get_node(&group_name.into()).await.is_err()); + ds.add_group(group_name.clone().try_into().unwrap()).await?; + ds.delete_group(group_name.clone().try_into().unwrap()).await?; + // deleting non-existing is no-op + assert!(ds.delete_group(group_name.clone().try_into().unwrap()).await.is_ok()); + assert!(ds.get_node(&group_name.try_into().unwrap()).await.is_err()); // add a new array and retrieve its node - ds.add_group("/group".to_string().into()).await?; + ds.add_group("/group".try_into().unwrap()).await?; let zarr_meta2 = ZarrArrayMetadata { shape: vec![3], @@ -1452,13 +1352,12 @@ mod tests { dimension_names: Some(vec![Some("t".to_string())]), }; - let new_array_path: PathBuf = "/group/array2".to_string().into(); + let new_array_path: Path = "/group/array2".to_string().try_into().unwrap(); ds.add_array(new_array_path.clone(), zarr_meta2.clone()).await?; ds.delete_array(new_array_path.clone()).await?; - // Delete a non-existent array - assert!(ds.delete_array(new_array_path.clone()).await.is_err()); - assert!(ds.delete_array(new_array_path.clone()).await.is_err()); + // Delete a non-existent array is no-op + assert!(ds.delete_array(new_array_path.clone()).await.is_ok()); assert!(ds.get_node(&new_array_path.clone()).await.is_err()); ds.add_array(new_array_path.clone(), zarr_meta2.clone()).await?; @@ -1484,7 +1383,7 @@ mod tests { assert_eq!( node.ok(), Some(NodeSnapshot { - path: "/group/array2".into(), + path: "/group/array2".try_into().unwrap(), id: 6, user_attributes: Some(UserAttributesSnapshot::Inline( UserAttributes::try_new(br#"{"n":42}"#).unwrap() @@ -1504,7 +1403,7 @@ mod tests { let non_chunk = ds.get_chunk_ref(&new_array_path, &ChunkIndices(vec![1])).await?; assert_eq!(non_chunk, None); - // update old array use attriutes and check them + // update old array use attributes and check them ds.set_user_attributes( array1_path.clone(), Some(UserAttributes::try_new(br#"{"updated": true}"#).unwrap()), @@ -1550,14 +1449,11 @@ mod tests { .await?; assert_eq!(chunk, Some(data)); - let path: Path = "/group/array2".into(); + let path: Path = "/group/array2".try_into().unwrap(); let node = ds.get_node(&path).await; - assert!(ds - .change_set - .updated_attributes - .contains_key(&node.as_ref().unwrap().id)); + assert!(ds.change_set.has_updated_attributes(&node.as_ref().unwrap().id)); assert!(ds.delete_array(path.clone()).await.is_ok()); - assert!(!ds.change_set.updated_attributes.contains_key(&node?.id)); + assert!(!ds.change_set.has_updated_attributes(&node?.id)); Ok(()) } @@ -1589,8 +1485,8 @@ mod tests { ]), }; - change_set.add_array("foo/bar".into(), 1, zarr_meta.clone()); - change_set.add_array("foo/baz".into(), 2, zarr_meta); + change_set.add_array("/foo/bar".try_into().unwrap(), 1, zarr_meta.clone()); + change_set.add_array("/foo/baz".try_into().unwrap(), 2, zarr_meta); assert_eq!(None, change_set.new_arrays_chunk_iterator().next()); change_set.set_chunk_ref(1, ChunkIndices(vec![0, 1]), None); @@ -1624,7 +1520,7 @@ mod tests { .collect(); let expected_chunks: Vec<_> = [ ( - "foo/baz".into(), + "/foo/baz".try_into().unwrap(), ChunkInfo { node: 2, coord: ChunkIndices(vec![0]), @@ -1632,7 +1528,7 @@ mod tests { }, ), ( - 
"foo/baz".into(), + "/foo/baz".try_into().unwrap(), ChunkInfo { node: 2, coord: ChunkIndices(vec![1]), @@ -1640,7 +1536,7 @@ mod tests { }, ), ( - "foo/bar".into(), + "/foo/bar".try_into().unwrap(), ChunkInfo { node: 1, coord: ChunkIndices(vec![1, 0]), @@ -1648,7 +1544,7 @@ mod tests { }, ), ( - "foo/bar".into(), + "/foo/bar".try_into().unwrap(), ChunkInfo { node: 1, coord: ChunkIndices(vec![1, 1]), @@ -1673,35 +1569,35 @@ mod tests { let mut ds = Repository::init(Arc::clone(&storage), false).await?.build(); // add a new array and retrieve its node - ds.add_group("/".into()).await?; + ds.add_group(Path::root()).await?; let snapshot_id = ds.flush("commit", SnapshotProperties::default()).await?; assert_eq!(snapshot_id, ds.snapshot_id); assert_eq!( - ds.get_node(&"/".into()).await.ok(), + ds.get_node(&Path::root()).await.ok(), Some(NodeSnapshot { id: 1, - path: "/".into(), + path: Path::root(), user_attributes: None, node_data: NodeData::Group }) ); - ds.add_group("/group".into()).await?; + ds.add_group("/group".try_into().unwrap()).await?; let _snapshot_id = ds.flush("commit", SnapshotProperties::default()).await?; assert_eq!( - ds.get_node(&"/".into()).await.ok(), + ds.get_node(&Path::root()).await.ok(), Some(NodeSnapshot { id: 1, - path: "/".into(), + path: Path::root(), user_attributes: None, node_data: NodeData::Group }) ); assert_eq!( - ds.get_node(&"/group".into()).await.ok(), + ds.get_node(&"/group".try_into().unwrap()).await.ok(), Some(NodeSnapshot { id: 2, - path: "/group".into(), + path: "/group".try_into().unwrap(), user_attributes: None, node_data: NodeData::Group }) @@ -1720,7 +1616,7 @@ mod tests { dimension_names: Some(vec![Some("t".to_string())]), }; - let new_array_path: PathBuf = "/group/array1".to_string().into(); + let new_array_path: Path = "/group/array1".try_into().unwrap(); ds.add_array(new_array_path.clone(), zarr_meta.clone()).await?; // wo commit to test the case of a chunkless array @@ -1736,19 +1632,19 @@ mod tests { let _snapshot_id = ds.flush("commit", SnapshotProperties::default()).await?; assert_eq!( - ds.get_node(&"/".into()).await.ok(), + ds.get_node(&Path::root()).await.ok(), Some(NodeSnapshot { id: 1, - path: "/".into(), + path: Path::root(), user_attributes: None, node_data: NodeData::Group }) ); assert_eq!( - ds.get_node(&"/group".into()).await.ok(), + ds.get_node(&"/group".try_into().unwrap()).await.ok(), Some(NodeSnapshot { id: 2, - path: "/group".into(), + path: "/group".try_into().unwrap(), user_attributes: None, node_data: NodeData::Group }) @@ -1832,6 +1728,9 @@ mod tests { atts == UserAttributesSnapshot::Inline(UserAttributes::try_new(br#"{"foo":42}"#).unwrap()) )); + // since we wrote every asset and we are using a caching storage, we should never need to fetch them + assert!(logging.fetch_operations().is_empty()); + //test the previous version is still alive let ds = Repository::update(Arc::clone(&storage), previous_snapshot_id).build(); assert_eq!( @@ -1843,8 +1742,257 @@ mod tests { Some(ChunkPayload::Inline("new chunk".into())) ); - // since we write every asset and we are using a caching storage, we should never need to fetch them - assert!(logging.fetch_operations().is_empty()); + Ok(()) + } + + #[tokio::test] + async fn test_basic_delete_and_flush() -> Result<(), Box> { + let storage: Arc = + Arc::new(ObjectStorage::new_in_memory_store(Some("prefix".into()))); + let mut ds = Repository::init(Arc::clone(&storage), false).await?.build(); + ds.add_group(Path::root()).await?; + ds.add_group("/1".try_into().unwrap()).await?; + 
ds.delete_group("/1".try_into().unwrap()).await?; + assert_eq!(ds.list_nodes().await?.count(), 1); + ds.commit("main", "commit", None).await?; + assert!(ds.get_group(&Path::root()).await.is_ok()); + assert!(ds.get_group(&"/1".try_into().unwrap()).await.is_err()); + assert_eq!(ds.list_nodes().await?.count(), 1); + Ok(()) + } + + #[tokio::test] + async fn test_basic_delete_after_flush() -> Result<(), Box> { + let storage: Arc = + Arc::new(ObjectStorage::new_in_memory_store(Some("prefix".into()))); + let mut ds = Repository::init(Arc::clone(&storage), false).await?.build(); + ds.add_group(Path::root()).await?; + ds.add_group("/1".try_into().unwrap()).await?; + ds.commit("main", "commit", None).await?; + + ds.delete_group("/1".try_into().unwrap()).await?; + assert!(ds.get_group(&Path::root()).await.is_ok()); + assert!(ds.get_group(&"/1".try_into().unwrap()).await.is_err()); + assert_eq!(ds.list_nodes().await?.count(), 1); + Ok(()) + } + + #[tokio::test] + async fn test_commit_after_deleting_old_node() -> Result<(), Box> { + let storage: Arc = + Arc::new(ObjectStorage::new_in_memory_store(Some("prefix".into()))); + let mut ds = Repository::init(Arc::clone(&storage), false).await?.build(); + ds.add_group(Path::root()).await?; + ds.commit("main", "commit", None).await?; + ds.delete_group(Path::root()).await?; + ds.commit("main", "commit", None).await?; + assert_eq!(ds.list_nodes().await?.count(), 0); + Ok(()) + } + + #[tokio::test] + async fn test_delete_children() -> Result<(), Box> { + let storage: Arc = + Arc::new(ObjectStorage::new_in_memory_store(Some("prefix".into()))); + let mut ds = Repository::init(Arc::clone(&storage), false).await?.build(); + ds.add_group(Path::root()).await?; + ds.add_group("/a".try_into().unwrap()).await?; + ds.add_group("/b".try_into().unwrap()).await?; + ds.add_group("/b/bb".try_into().unwrap()).await?; + ds.delete_group("/b".try_into().unwrap()).await?; + assert!(ds.get_group(&"/b".try_into().unwrap()).await.is_err()); + assert!(ds.get_group(&"/b/bb".try_into().unwrap()).await.is_err()); + Ok(()) + } + + #[tokio::test] + async fn test_delete_children_of_old_nodes() -> Result<(), Box> { + let storage: Arc = + Arc::new(ObjectStorage::new_in_memory_store(Some("prefix".into()))); + let mut ds = Repository::init(Arc::clone(&storage), false).await?.build(); + ds.add_group(Path::root()).await?; + ds.add_group("/a".try_into().unwrap()).await?; + ds.add_group("/b".try_into().unwrap()).await?; + ds.add_group("/b/bb".try_into().unwrap()).await?; + ds.commit("main", "commit", None).await?; + + ds.delete_group("/b".try_into().unwrap()).await?; + assert!(ds.get_group(&"/b".try_into().unwrap()).await.is_err()); + assert!(ds.get_group(&"/b/bb".try_into().unwrap()).await.is_err()); + Ok(()) + } + + #[tokio::test] + async fn test_manifests_shrink() -> Result<(), Box> { + let in_mem_storage = + Arc::new(ObjectStorage::new_in_memory_store(Some("prefix".into()))); + let storage: Arc = in_mem_storage.clone(); + let mut ds = Repository::init(Arc::clone(&storage), false).await?.build(); + + // there should be no manifests yet + assert!(!in_mem_storage + .all_keys() + .await? + .iter() + .any(|key| key.contains("manifest"))); + + // initialization creates one snapshot + assert_eq!( + 1, + in_mem_storage + .all_keys() + .await? 
+ .iter() + .filter(|key| key.contains("snapshot")) + .count(), + ); + + ds.add_group(Path::root()).await?; + let zarr_meta = ZarrArrayMetadata { + shape: vec![5, 5], + data_type: DataType::Float16, + chunk_shape: ChunkShape(vec![NonZeroU64::new(2).unwrap()]), + chunk_key_encoding: ChunkKeyEncoding::Slash, + fill_value: FillValue::Float16(f32::NEG_INFINITY), + codecs: vec![Codec { name: "mycodec".to_string(), configuration: None }], + storage_transformers: Some(vec![StorageTransformer { + name: "mytransformer".to_string(), + configuration: None, + }]), + dimension_names: Some(vec![Some("t".to_string())]), + }; + + let a1path: Path = "/array1".try_into()?; + let a2path: Path = "/array2".try_into()?; + + ds.add_array(a1path.clone(), zarr_meta.clone()).await?; + ds.add_array(a2path.clone(), zarr_meta.clone()).await?; + + let _ = ds.commit("main", "first commit", None).await?; + + // there should be no manifests yet because we didn't add any chunks + assert_eq!( + 0, + in_mem_storage + .all_keys() + .await? + .iter() + .filter(|key| key.contains("manifest")) + .count(), + ); + // there should be two snapshots, one for the initialization commit and one for the real + // commit + assert_eq!( + 2, + in_mem_storage + .all_keys() + .await? + .iter() + .filter(|key| key.contains("snapshot")) + .count(), + ); + + // add 3 chunks + ds.set_chunk_ref( + a1path.clone(), + ChunkIndices(vec![0, 0]), + Some(ChunkPayload::Inline("hello".into())), + ) + .await?; + ds.set_chunk_ref( + a1path.clone(), + ChunkIndices(vec![0, 1]), + Some(ChunkPayload::Inline("hello".into())), + ) + .await?; + ds.set_chunk_ref( + a2path.clone(), + ChunkIndices(vec![0, 1]), + Some(ChunkPayload::Inline("hello".into())), + ) + .await?; + + ds.commit("main", "commit", None).await?; + + // there should be one manifest now + assert_eq!( + 1, + in_mem_storage + .all_keys() + .await? + .iter() + .filter(|key| key.contains("manifest")) + .count() + ); + + let manifest_id = match ds.get_array(&a1path).await?.node_data { + NodeData::Array(_, manifests) => { + manifests.first().as_ref().unwrap().object_id.clone() + } + NodeData::Group => panic!("must be an array"), + }; + let manifest = storage.fetch_manifests(&manifest_id).await?; + let initial_size = manifest.len(); + + ds.delete_array(a2path).await?; + ds.commit("main", "array2 deleted", None).await?; + + // there should be two manifests + assert_eq!( + 2, + in_mem_storage + .all_keys() + .await? + .iter() + .filter(|key| key.contains("manifest")) + .count() + ); + + let manifest_id = match ds.get_array(&a1path).await?.node_data { + NodeData::Array(_, manifests) => { + manifests.first().as_ref().unwrap().object_id.clone() + } + NodeData::Group => panic!("must be an array"), + }; + let manifest = storage.fetch_manifests(&manifest_id).await?; + let size_after_delete = manifest.len(); + + assert!(size_after_delete < initial_size); + + // delete a chunk + ds.set_chunk_ref(a1path.clone(), ChunkIndices(vec![0, 0]), None).await?; + ds.commit("main", "chunk deleted", None).await?; + + // there should be three manifests + assert_eq!( + 3, + in_mem_storage + .all_keys() + .await? + .iter() + .filter(|key| key.contains("manifest")) + .count() + ); + // there should be five snapshots + assert_eq!( + 5, + in_mem_storage + .all_keys() + .await? 
+ .iter() + .filter(|key| key.contains("snapshot")) + .count(), + ); + + let manifest_id = match ds.get_array(&a1path).await?.node_data { + NodeData::Array(_, manifests) => { + manifests.first().as_ref().unwrap().object_id.clone() + } + NodeData::Group => panic!("must be an array"), + }; + let manifest = storage.fetch_manifests(&manifest_id).await?; + let size_after_chunk_delete = manifest.len(); + assert!(size_after_chunk_delete < size_after_delete); Ok(()) } @@ -1856,7 +2004,7 @@ mod tests { let mut ds = Repository::init(Arc::clone(&storage), false).await?.build(); // add a new array and retrieve its node - ds.add_group("/".into()).await?; + ds.add_group(Path::root()).await?; let zarr_meta = ZarrArrayMetadata { shape: vec![1, 1, 2], data_type: DataType::Int32, @@ -1871,7 +2019,7 @@ mod tests { dimension_names: Some(vec![Some("t".to_string())]), }; - let new_array_path: PathBuf = "/array".to_string().into(); + let new_array_path: Path = "/array".try_into().unwrap(); ds.add_array(new_array_path.clone(), zarr_meta.clone()).await?; // we 3 chunks ds.set_chunk_ref( @@ -1920,7 +2068,7 @@ mod tests { let mut ds = Repository::init(Arc::clone(&storage), false).await?.build(); // add a new array and retrieve its node - ds.add_group("/".into()).await?; + ds.add_group(Path::root()).await?; let new_snapshot_id = ds.commit(Ref::DEFAULT_BRANCH, "first commit", None).await?; assert_eq!( @@ -1935,10 +2083,10 @@ mod tests { assert_eq!(new_snapshot_id, ref_data.snapshot); assert_eq!( - ds.get_node(&"/".into()).await.ok(), + ds.get_node(&Path::root()).await.ok(), Some(NodeSnapshot { id: 1, - path: "/".into(), + path: Path::root(), user_attributes: None, node_data: NodeData::Group }) @@ -1947,10 +2095,10 @@ mod tests { let mut ds = Repository::from_branch_tip(Arc::clone(&storage), "main").await?.build(); assert_eq!( - ds.get_node(&"/".into()).await.ok(), + ds.get_node(&Path::root()).await.ok(), Some(NodeSnapshot { id: 1, - path: "/".into(), + path: Path::root(), user_attributes: None, node_data: NodeData::Group }) @@ -1969,7 +2117,7 @@ mod tests { dimension_names: Some(vec![Some("t".to_string())]), }; - let new_array_path: PathBuf = "/array1".to_string().into(); + let new_array_path: Path = "/array1".try_into().unwrap(); ds.add_array(new_array_path.clone(), zarr_meta.clone()).await?; ds.set_chunk_ref( new_array_path.clone(), @@ -2006,8 +2154,8 @@ mod tests { let mut ds2 = Repository::from_branch_tip(Arc::clone(&storage), "main").await?.build(); - ds1.add_group("a".into()).await?; - ds2.add_group("b".into()).await?; + ds1.add_group("/a".try_into().unwrap()).await?; + ds2.add_group("/b".try_into().unwrap()).await?; let barrier = Arc::new(Barrier::new(2)); let barrier_c = Arc::clone(&barrier); @@ -2175,6 +2323,8 @@ mod tests { "Attempting to delete a non-existent path: {path}", ); state.groups.swap_remove(index); + state.groups.retain(|group| !group.starts_with(path)); + state.arrays.retain(|array, _| !array.starts_with(path)); } _ => panic!(), } diff --git a/icechunk/src/storage/caching.rs b/icechunk/src/storage/caching.rs index 1800ec45..34a2a6d1 100644 --- a/icechunk/src/storage/caching.rs +++ b/icechunk/src/storage/caching.rs @@ -5,9 +5,12 @@ use bytes::Bytes; use futures::stream::BoxStream; use quick_cache::sync::Cache; -use crate::format::{ - attributes::AttributesTable, manifest::Manifest, snapshot::Snapshot, AttributesId, - ByteRange, ChunkId, ManifestId, SnapshotId, +use crate::{ + format::{ + attributes::AttributesTable, manifest::Manifest, snapshot::Snapshot, + AttributesId, ByteRange, ChunkId, 
ManifestId, SnapshotId, + }, + private, }; use super::{Storage, StorageError, StorageResult}; @@ -39,6 +42,8 @@ impl MemCachingStorage { } } +impl private::Sealed for MemCachingStorage {} + #[async_trait] impl Storage for MemCachingStorage { async fn fetch_snapshot( @@ -152,7 +157,10 @@ impl Storage for MemCachingStorage { self.backend.write_ref(ref_key, overwrite_refs, bytes).await } - async fn ref_versions(&self, ref_name: &str) -> BoxStream> { + async fn ref_versions( + &self, + ref_name: &str, + ) -> StorageResult>> { self.backend.ref_versions(ref_name).await } } diff --git a/icechunk/src/storage/logging.rs b/icechunk/src/storage/logging.rs index 7e5334b1..904010ad 100644 --- a/icechunk/src/storage/logging.rs +++ b/icechunk/src/storage/logging.rs @@ -5,9 +5,12 @@ use bytes::Bytes; use futures::stream::BoxStream; use super::{Storage, StorageError, StorageResult}; -use crate::format::{ - attributes::AttributesTable, manifest::Manifest, snapshot::Snapshot, AttributesId, - ByteRange, ChunkId, ManifestId, SnapshotId, +use crate::{ + format::{ + attributes::AttributesTable, manifest::Manifest, snapshot::Snapshot, + AttributesId, ByteRange, ChunkId, ManifestId, SnapshotId, + }, + private, }; #[derive(Debug)] @@ -28,6 +31,8 @@ impl LoggingStorage { } } +impl private::Sealed for LoggingStorage {} + #[async_trait] #[allow(clippy::expect_used)] // this implementation is intended for tests only impl Storage for LoggingStorage { @@ -121,7 +126,10 @@ impl Storage for LoggingStorage { self.backend.write_ref(ref_key, overwrite_refs, bytes).await } - async fn ref_versions(&self, ref_name: &str) -> BoxStream> { + async fn ref_versions( + &self, + ref_name: &str, + ) -> StorageResult>> { self.backend.ref_versions(ref_name).await } } diff --git a/icechunk/src/storage/mod.rs b/icechunk/src/storage/mod.rs index f8a9f10d..c63f5351 100644 --- a/icechunk/src/storage/mod.rs +++ b/icechunk/src/storage/mod.rs @@ -1,6 +1,15 @@ +use aws_sdk_s3::{ + config::http::HttpResponse, + error::SdkError, + operation::{ + get_object::GetObjectError, list_objects_v2::ListObjectsV2Error, + put_object::PutObjectError, + }, + primitives::ByteStreamError, +}; use core::fmt; use futures::stream::BoxStream; -use std::sync::Arc; +use std::{ffi::OsString, sync::Arc}; use async_trait::async_trait; use bytes::Bytes; @@ -12,44 +21,54 @@ pub mod caching; pub mod logging; pub mod object_store; +pub mod s3; pub mod virtual_ref; pub use caching::MemCachingStorage; pub use object_store::ObjectStorage; -use crate::format::{ - attributes::AttributesTable, manifest::Manifest, snapshot::Snapshot, AttributesId, - ByteRange, ChunkId, ManifestId, Path, SnapshotId, +use crate::{ + format::{ + attributes::AttributesTable, manifest::Manifest, snapshot::Snapshot, + AttributesId, ByteRange, ChunkId, ManifestId, SnapshotId, + }, + private, }; #[derive(Debug, Error)] pub enum StorageError { #[error("error contacting object store {0}")] ObjectStore(#[from] ::object_store::Error), + #[error("bad object store prefix {0:?}")] + BadPrefix(OsString), + #[error("error getting object from object store {0}")] + S3GetObjectError(#[from] SdkError), + #[error("error writing object to object store {0}")] + S3PutObjectError(#[from] SdkError), + #[error("error listing objects in object store {0}")] + S3ListObjectError(#[from] SdkError), + #[error("error streaming bytes from object store {0}")] + S3StreamError(#[from] ByteStreamError), #[error("messagepack decode error: {0}")] MsgPackDecodeError(#[from] rmp_serde::decode::Error), #[error("messagepack encode error: {0}")] 
MsgPackEncodeError(#[from] rmp_serde::encode::Error), - #[error("error parsing RecordBatch from parquet file {0}.")] - BadRecordBatchRead(Path), #[error("cannot overwrite ref: {0}")] RefAlreadyExists(String), #[error("ref not found: {0}")] RefNotFound(String), - #[error("generic storage error: {0}")] - OtherError(#[from] Arc), #[error("unknown storage error: {0}")] Other(String), } -type StorageResult = Result; +pub type StorageResult = Result; /// Fetch and write the parquet files that represent the repository in object store /// /// Different implementation can cache the files differently, or not at all. /// Implementations are free to assume files are never overwritten. #[async_trait] -pub trait Storage: fmt::Debug { +pub trait Storage: fmt::Debug + private::Sealed { async fn fetch_snapshot(&self, id: &SnapshotId) -> StorageResult>; async fn fetch_attributes( &self, @@ -77,7 +96,10 @@ pub trait Storage: fmt::Debug { async fn get_ref(&self, ref_key: &str) -> StorageResult; async fn ref_names(&self) -> StorageResult>; - async fn ref_versions(&self, ref_name: &str) -> BoxStream>; + async fn ref_versions( + &self, + ref_name: &str, + ) -> StorageResult>>; async fn write_ref( &self, ref_key: &str, diff --git a/icechunk/src/storage/object_store.rs b/icechunk/src/storage/object_store.rs index 6fb53645..ee929391 100644 --- a/icechunk/src/storage/object_store.rs +++ b/icechunk/src/storage/object_store.rs @@ -1,18 +1,21 @@ -use crate::format::{ - attributes::AttributesTable, manifest::Manifest, snapshot::Snapshot, AttributesId, - ByteRange, ChunkId, FileTypeTag, ManifestId, ObjectId, SnapshotId, +use crate::{ + format::{ + attributes::AttributesTable, format_constants, manifest::Manifest, + snapshot::Snapshot, AttributesId, ByteRange, ChunkId, FileTypeTag, ManifestId, + ObjectId, SnapshotId, + }, + private, }; use async_trait::async_trait; use bytes::Bytes; -use core::fmt; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use object_store::{ - local::LocalFileSystem, memory::InMemory, path::Path as ObjectPath, GetOptions, - GetRange, ObjectStore, PutMode, PutOptions, PutPayload, + local::LocalFileSystem, memory::InMemory, path::Path as ObjectPath, Attribute, + AttributeValue, Attributes, GetOptions, GetRange, ObjectStore, PutMode, PutOptions, + PutPayload, }; -use serde::{Deserialize, Serialize}; use std::{ - fs::create_dir_all, future::ready, ops::Bound, path::Path as StdPath, sync::Arc, + fs::create_dir_all, future::ready, ops::Range, path::Path as StdPath, sync::Arc, }; use super::{Storage, StorageError, StorageResult}; @@ -20,32 +23,13 @@ use super::{Storage, StorageError, StorageResult}; // Get Range is object_store specific, keep it with this module impl From<&ByteRange> for Option { fn from(value: &ByteRange) -> Self { - match (value.0, value.1) { - (Bound::Included(start), Bound::Excluded(end)) => { - Some(GetRange::Bounded(start as usize..end as usize)) + match value { + ByteRange::Bounded(Range { start, end }) => { + Some(GetRange::Bounded(*start as usize..*end as usize)) } - (Bound::Included(start), Bound::Unbounded) => { - Some(GetRange::Offset(start as usize)) - } - (Bound::Included(start), Bound::Included(end)) => { - Some(GetRange::Bounded(start as usize..end as usize + 1)) - } - (Bound::Excluded(start), Bound::Excluded(end)) => { - Some(GetRange::Bounded(start as usize + 1..end as usize)) - } - (Bound::Excluded(start), Bound::Unbounded) => { - Some(GetRange::Offset(start as usize + 1)) - } - (Bound::Excluded(start), Bound::Included(end)) => { - 
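// Illustrative sketch, not part of the patch: the `private::Sealed` supertrait that this
// diff adds to `Storage` (and implements for every backend) is the usual sealed-trait
// pattern, which keeps downstream crates from implementing the trait so it can grow
// without breaking changes. Minimal standalone version; the method is a made-up stand-in.
mod sealed_example {
    mod private {
        pub trait Sealed {}
    }

    pub trait Storage: private::Sealed {
        fn backend_name(&self) -> &'static str;
    }

    pub struct InMemoryBackend;

    impl private::Sealed for InMemoryBackend {}

    impl Storage for InMemoryBackend {
        fn backend_name(&self) -> &'static str {
            "in-memory"
        }
    }
}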
Some(GetRange::Bounded(start as usize + 1..end as usize + 1)) - } - (Bound::Unbounded, Bound::Excluded(end)) => { - Some(GetRange::Bounded(0..end as usize)) - } - (Bound::Unbounded, Bound::Included(end)) => { - Some(GetRange::Bounded(0..end as usize + 1)) - } - (Bound::Unbounded, Bound::Unbounded) => None, + ByteRange::From(start) if *start == 0u64 => None, + ByteRange::From(start) => Some(GetRange::Offset(*start as usize)), + ByteRange::Last(n) => Some(GetRange::Suffix(*n as usize)), } } } @@ -56,13 +40,7 @@ const MANIFEST_PREFIX: &str = "manifests/"; const CHUNK_PREFIX: &str = "chunks/"; const REF_PREFIX: &str = "refs"; -#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] -pub struct S3Credentials { - pub access_key_id: String, - pub secret_access_key: String, - pub session_token: Option, -} - +#[derive(Debug)] pub struct ObjectStorage { store: Arc, prefix: String, @@ -70,13 +48,12 @@ pub struct ObjectStorage { // implementation is used only for tests, it's OK to sort in memory. artificially_sort_refs_in_mem: bool, - // We need this because object_store's hasn't implemented support for create-if-not-exists in - // S3 yet. We'll delete this after they do. supports_create_if_not_exists: bool, + supports_metadata: bool, } impl ObjectStorage { - /// Create an in memory Storage implementantion + /// Create an in memory Storage implementation /// /// This implementation should not be used in production code. pub fn new_in_memory_store(prefix: Option) -> ObjectStorage { @@ -88,10 +65,11 @@ impl ObjectStorage { prefix, artificially_sort_refs_in_mem: false, supports_create_if_not_exists: true, + supports_metadata: true, } } - /// Create an local filesystem Storage implementantion + /// Create an local filesystem Storage implementation /// /// This implementation should not be used in production code. pub fn new_local_store(prefix: &StdPath) -> Result { @@ -103,69 +81,43 @@ impl ObjectStorage { prefix: "".to_string(), artificially_sort_refs_in_mem: true, supports_create_if_not_exists: true, + supports_metadata: false, }) } - pub fn new_s3_store( - bucket_name: impl Into, - prefix: impl Into, - credentials: Option, - endpoint: Option>, - ) -> Result { - use object_store::aws::AmazonS3Builder; - - let builder = if let Some(credentials) = credentials { - let builder = AmazonS3Builder::new() - .with_access_key_id(credentials.access_key_id) - .with_secret_access_key(credentials.secret_access_key); - - if let Some(token) = credentials.session_token { - builder.with_token(token) - } else { - builder - } - } else { - AmazonS3Builder::from_env() - }; - - let builder = if let Some(endpoint) = endpoint { - builder.with_endpoint(endpoint).with_allow_http(true) - } else { - builder - }; - - let store = builder.with_bucket_name(bucket_name.into()).build()?; - Ok(ObjectStorage { - store: Arc::new(store), - prefix: prefix.into(), - artificially_sort_refs_in_mem: false, - // FIXME: this will go away once object_store supports create-if-not-exist on S3 - supports_create_if_not_exists: false, - }) + /// Return all keys in the store + /// + /// Intended for testing and debugging purposes only. + pub async fn all_keys(&self) -> StorageResult> { + Ok(self + .store + .list(None) + .map_ok(|obj| obj.location.to_string()) + .try_collect() + .await?) 
} fn get_path( &self, file_prefix: &str, - extension: &str, id: &ObjectId, ) -> ObjectPath { // TODO: be careful about allocation here // we serialize the url using crockford - let path = format!("{}/{}/{}{}", self.prefix, file_prefix, id, extension); + let path = format!("{}/{}/{}", self.prefix, file_prefix, id); ObjectPath::from(path) } fn get_snapshot_path(&self, id: &SnapshotId) -> ObjectPath { - self.get_path(SNAPSHOT_PREFIX, ".msgpack", id) + self.get_path(SNAPSHOT_PREFIX, id) } fn get_manifest_path(&self, id: &ManifestId) -> ObjectPath { - self.get_path(MANIFEST_PREFIX, ".msgpack", id) + self.get_path(MANIFEST_PREFIX, id) } fn get_chunk_path(&self, id: &ChunkId) -> ObjectPath { - self.get_path(CHUNK_PREFIX, "", id) + self.get_path(CHUNK_PREFIX, id) } fn drop_prefix(&self, prefix: &ObjectPath, path: &ObjectPath) -> Option { @@ -195,11 +147,8 @@ impl ObjectStorage { } } -impl fmt::Debug for ObjectStorage { - fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { - write!(f, "ObjectStorage, prefix={}, store={}", self.prefix, self.store) - } -} +impl private::Sealed for ObjectStorage {} + #[async_trait] impl Storage for ObjectStorage { async fn fetch_snapshot( @@ -234,12 +183,33 @@ impl Storage for ObjectStorage { async fn write_snapshot( &self, id: SnapshotId, - table: Arc, + snapshot: Arc, ) -> Result<(), StorageError> { let path = self.get_snapshot_path(&id); - let bytes = rmp_serde::to_vec(table.as_ref())?; + let bytes = rmp_serde::to_vec(snapshot.as_ref())?; + let attributes = if self.supports_metadata { + Attributes::from_iter(vec![ + ( + Attribute::ContentType, + AttributeValue::from( + format_constants::LATEST_ICECHUNK_SNAPSHOT_CONTENT_TYPE, + ), + ), + ( + Attribute::Metadata(std::borrow::Cow::Borrowed( + format_constants::LATEST_ICECHUNK_SNAPSHOT_VERSION_METADATA_KEY, + )), + AttributeValue::from( + snapshot.icechunk_snapshot_format_version.to_string(), + ), + ), + ]) + } else { + Attributes::new() + }; + let options = PutOptions { attributes, ..PutOptions::default() }; // FIXME: use multipart - self.store.put(&path, bytes.into()).await?; + self.store.put_opts(&path, bytes.into(), options).await?; Ok(()) } @@ -254,12 +224,33 @@ impl Storage for ObjectStorage { async fn write_manifests( &self, id: ManifestId, - table: Arc, + manifest: Arc, ) -> Result<(), StorageError> { let path = self.get_manifest_path(&id); - let bytes = rmp_serde::to_vec(table.as_ref())?; + let bytes = rmp_serde::to_vec(manifest.as_ref())?; + let attributes = if self.supports_metadata { + Attributes::from_iter(vec![ + ( + Attribute::ContentType, + AttributeValue::from( + format_constants::LATEST_ICECHUNK_MANIFEST_CONTENT_TYPE, + ), + ), + ( + Attribute::Metadata(std::borrow::Cow::Borrowed( + format_constants::LATEST_ICECHUNK_MANIFEST_VERSION_METADATA_KEY, + )), + AttributeValue::from( + manifest.icechunk_manifest_format_version.to_string(), + ), + ), + ]) + } else { + Attributes::new() + }; + let options = PutOptions { attributes, ..PutOptions::default() }; // FIXME: use multipart - self.store.put(&path, bytes.into()).await?; + self.store.put_opts(&path, bytes.into(), options).await?; Ok(()) } @@ -319,7 +310,10 @@ impl Storage for ObjectStorage { .collect()) } - async fn ref_versions(&self, ref_name: &str) -> BoxStream> { + async fn ref_versions( + &self, + ref_name: &str, + ) -> StorageResult>> { let res = self.do_ref_versions(ref_name).await; if self.artificially_sort_refs_in_mem { #[allow(clippy::expect_used)] @@ -329,9 +323,9 @@ impl Storage for ObjectStorage { let mut all = 
res.try_collect::>().await.expect("Error fetching ref versions"); all.sort(); - futures::stream::iter(all.into_iter().map(Ok)).boxed() + Ok(futures::stream::iter(all.into_iter().map(Ok)).boxed()) } else { - res + Ok(res) } } diff --git a/icechunk/src/storage/s3.rs b/icechunk/src/storage/s3.rs new file mode 100644 index 00000000..543cc00e --- /dev/null +++ b/icechunk/src/storage/s3.rs @@ -0,0 +1,420 @@ +use std::{ops::Range, path::PathBuf, sync::Arc}; + +use async_stream::try_stream; +use async_trait::async_trait; +use aws_config::{meta::region::RegionProviderChain, AppName, BehaviorVersion}; +use aws_credential_types::Credentials; +use aws_sdk_s3::{ + config::{Builder, Region}, + error::ProvideErrorMetadata, + primitives::ByteStream, + Client, +}; +use bytes::Bytes; +use futures::StreamExt; +use serde::{Deserialize, Serialize}; + +use crate::{ + format::{ + attributes::AttributesTable, format_constants, manifest::Manifest, + snapshot::Snapshot, AttributesId, ByteRange, ChunkId, FileTypeTag, ManifestId, + SnapshotId, + }, + private, + zarr::ObjectId, + Storage, StorageError, +}; + +use super::StorageResult; + +#[derive(Debug)] +pub struct S3Storage { + client: Arc, + prefix: String, + bucket: String, +} + +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)] +pub struct StaticS3Credentials { + pub access_key_id: String, + pub secret_access_key: String, + pub session_token: Option, +} + +#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq, Default)] +#[serde(tag = "type")] +pub enum S3Credentials { + #[default] + #[serde(rename = "from_env")] + FromEnv, + #[serde(rename = "anonymous")] + Anonymous, + #[serde(rename = "static")] + Static(StaticS3Credentials), +} + +#[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)] +pub struct S3Config { + pub region: Option, + pub endpoint: Option, + pub credentials: S3Credentials, + pub allow_http: bool, +} + +pub async fn mk_client(config: Option<&S3Config>) -> Client { + let region = config + .and_then(|c| c.region.as_ref()) + .map(|r| RegionProviderChain::first_try(Some(Region::new(r.clone())))) + .unwrap_or_else(RegionProviderChain::default_provider); + + let endpoint = config.and_then(|c| c.endpoint.clone()); + let allow_http = config.map(|c| c.allow_http).unwrap_or(false); + let credentials = + config.map(|c| c.credentials.clone()).unwrap_or(S3Credentials::FromEnv); + #[allow(clippy::unwrap_used)] + let app_name = AppName::new("icechunk").unwrap(); + let mut aws_config = aws_config::defaults(BehaviorVersion::v2024_03_28()) + .region(region) + .app_name(app_name); + + if let Some(endpoint) = endpoint { + aws_config = aws_config.endpoint_url(endpoint) + } + + match credentials { + S3Credentials::FromEnv => {} + S3Credentials::Anonymous => aws_config = aws_config.no_credentials(), + S3Credentials::Static(credentials) => { + aws_config = aws_config.credentials_provider(Credentials::new( + credentials.access_key_id, + credentials.secret_access_key, + credentials.session_token, + None, + "user", + )); + } + } + + let mut s3_builder = Builder::from(&aws_config.load().await); + + if allow_http { + s3_builder = s3_builder.force_path_style(true); + } + + let config = s3_builder.build(); + + Client::from_conf(config) +} + +const SNAPSHOT_PREFIX: &str = "snapshots/"; +const MANIFEST_PREFIX: &str = "manifests/"; +// const ATTRIBUTES_PREFIX: &str = "attributes/"; +const CHUNK_PREFIX: &str = "chunks/"; +const REF_PREFIX: &str = "refs"; + +impl S3Storage { + pub async fn new_s3_store( + bucket_name: impl Into, + prefix: impl 
Into, + config: Option<&S3Config>, + ) -> Result { + let client = Arc::new(mk_client(config).await); + Ok(S3Storage { client, prefix: prefix.into(), bucket: bucket_name.into() }) + } + + fn get_path( + &self, + file_prefix: &str, + id: &ObjectId, + ) -> StorageResult { + // we serialize the url using crockford + let path = PathBuf::from_iter([ + self.prefix.as_str(), + file_prefix, + id.to_string().as_str(), + ]); + path.into_os_string().into_string().map_err(StorageError::BadPrefix) + } + + fn get_snapshot_path(&self, id: &SnapshotId) -> StorageResult { + self.get_path(SNAPSHOT_PREFIX, id) + } + + fn get_manifest_path(&self, id: &ManifestId) -> StorageResult { + self.get_path(MANIFEST_PREFIX, id) + } + + fn get_chunk_path(&self, id: &ChunkId) -> StorageResult { + self.get_path(CHUNK_PREFIX, id) + } + + fn ref_key(&self, ref_key: &str) -> StorageResult { + let path = PathBuf::from_iter([self.prefix.as_str(), REF_PREFIX, ref_key]); + path.into_os_string().into_string().map_err(StorageError::BadPrefix) + } + + async fn get_object(&self, key: &str) -> StorageResult { + Ok(self + .client + .get_object() + .bucket(self.bucket.clone()) + .key(key) + .send() + .await? + .body + .collect() + .await? + .into_bytes()) + } + + async fn get_object_range( + &self, + key: &str, + range: &ByteRange, + ) -> StorageResult { + let mut b = self.client.get_object().bucket(self.bucket.clone()).key(key); + + if let Some(header) = range_to_header(range) { + b = b.range(header) + }; + + Ok(b.send().await?.body.collect().await?.into_bytes()) + } + + async fn put_object< + I: IntoIterator, impl Into)>, + >( + &self, + key: &str, + content_type: Option>, + metadata: I, + bytes: impl Into, + ) -> StorageResult<()> { + let mut b = self.client.put_object().bucket(self.bucket.clone()).key(key); + + if let Some(ct) = content_type { + b = b.content_type(ct) + }; + + for (k, v) in metadata { + b = b.metadata(k, v); + } + + b.body(bytes.into()).send().await?; + Ok(()) + } +} + +pub fn range_to_header(range: &ByteRange) -> Option { + match range { + ByteRange::Bounded(Range { start, end }) => { + Some(format!("bytes={}-{}", start, end - 1)) + } + ByteRange::From(offset) if *offset == 0 => None, + ByteRange::From(offset) => Some(format!("bytes={}-", offset)), + ByteRange::Last(n) => Some(format!("bytes={}-", n)), + } +} + +impl private::Sealed for S3Storage {} + +#[async_trait] +impl Storage for S3Storage { + async fn fetch_snapshot(&self, id: &SnapshotId) -> StorageResult> { + let key = self.get_snapshot_path(id)?; + let bytes = self.get_object(key.as_str()).await?; + // TODO: optimize using from_read + let res = rmp_serde::from_slice(bytes.as_ref())?; + Ok(Arc::new(res)) + } + + async fn fetch_attributes( + &self, + _id: &AttributesId, + ) -> StorageResult> { + todo!() + } + + async fn fetch_manifests(&self, id: &ManifestId) -> StorageResult> { + let key = self.get_manifest_path(id)?; + let bytes = self.get_object(key.as_str()).await?; + // TODO: optimize using from_read + let res = rmp_serde::from_slice(bytes.as_ref())?; + Ok(Arc::new(res)) + } + + async fn fetch_chunk(&self, id: &ChunkId, range: &ByteRange) -> StorageResult { + let key = self.get_chunk_path(id)?; + let bytes = self.get_object_range(key.as_str(), range).await?; + Ok(bytes) + } + + async fn write_snapshot( + &self, + id: SnapshotId, + snapshot: Arc, + ) -> StorageResult<()> { + let key = self.get_snapshot_path(&id)?; + let bytes = rmp_serde::to_vec(snapshot.as_ref())?; + let metadata = [( + format_constants::LATEST_ICECHUNK_SNAPSHOT_VERSION_METADATA_KEY, + 
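// Illustrative sketch, not part of the patch: building the S3Config / S3Credentials types
// declared above for a local, path-style endpoint. Field types are inferred from the
// declarations in this file and all values are placeholders.
fn example_s3_config() -> S3Config {
    S3Config {
        region: Some("us-east-1".to_string()),
        endpoint: Some("http://localhost:9000".to_string()),
        credentials: S3Credentials::Static(StaticS3Credentials {
            access_key_id: "my-access-key".to_string(),
            secret_access_key: "my-secret-key".to_string(),
            session_token: None,
        }),
        allow_http: true,
    }
}
// A config like this would then be handed to `S3Storage::new_s3_store(bucket, prefix,
// Some(&config))` as defined above.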
snapshot.icechunk_snapshot_format_version.to_string(), + )]; + self.put_object( + key.as_str(), + Some(format_constants::LATEST_ICECHUNK_SNAPSHOT_CONTENT_TYPE), + metadata, + bytes, + ) + .await + } + + async fn write_attributes( + &self, + _id: AttributesId, + _table: Arc, + ) -> StorageResult<()> { + todo!() + } + + async fn write_manifests( + &self, + id: ManifestId, + manifest: Arc, + ) -> Result<(), StorageError> { + let key = self.get_manifest_path(&id)?; + let bytes = rmp_serde::to_vec(manifest.as_ref())?; + let metadata = [( + format_constants::LATEST_ICECHUNK_MANIFEST_VERSION_METADATA_KEY, + manifest.icechunk_manifest_format_version.to_string(), + )]; + self.put_object( + key.as_str(), + Some(format_constants::LATEST_ICECHUNK_MANIFEST_CONTENT_TYPE), + metadata, + bytes, + ) + .await + } + + async fn write_chunk( + &self, + id: ChunkId, + bytes: bytes::Bytes, + ) -> Result<(), StorageError> { + let key = self.get_chunk_path(&id)?; + //FIXME: use multipart upload + let metadata: [(String, String); 0] = []; + self.put_object(key.as_str(), None::, metadata, bytes).await + } + + async fn get_ref(&self, ref_key: &str) -> StorageResult { + let key = self.ref_key(ref_key)?; + let res = self + .client + .get_object() + .bucket(self.bucket.clone()) + .key(key.clone()) + .send() + .await; + + match res { + Ok(res) => Ok(res.body.collect().await?.into_bytes()), + Err(err) + if err + .as_service_error() + .map(|e| e.is_no_such_key()) + .unwrap_or(false) => + { + Err(StorageError::RefNotFound(key.to_string())) + } + Err(err) => Err(err.into()), + } + } + + async fn ref_names(&self) -> StorageResult> { + let prefix = self.ref_key("")?; + let mut paginator = self + .client + .list_objects_v2() + .bucket(self.bucket.clone()) + .prefix(prefix.clone()) + .delimiter("/") + .into_paginator() + .send(); + + let mut res = Vec::new(); + + while let Some(page) = paginator.try_next().await? { + for common_prefix in page.common_prefixes() { + if let Some(key) = common_prefix + .prefix() + .as_ref() + .and_then(|key| key.strip_prefix(prefix.as_str())) + .and_then(|key| key.strip_suffix('/')) + { + res.push(key.to_string()); + } + } + } + + Ok(res) + } + + async fn ref_versions( + &self, + ref_name: &str, + ) -> StorageResult>> { + let prefix = self.ref_key(ref_name)?; + let mut paginator = self + .client + .list_objects_v2() + .bucket(self.bucket.clone()) + .prefix(prefix.clone()) + .into_paginator() + .send(); + + let prefix = prefix + "/"; + let stream = try_stream! { + while let Some(page) = paginator.try_next().await? 
{ + for object in page.contents() { + if let Some(key) = object.key.as_ref().and_then(|key| key.strip_prefix(prefix.as_str())) { + yield key.to_string() + } + } + } + }; + Ok(stream.boxed()) + } + + async fn write_ref( + &self, + ref_key: &str, + overwrite_refs: bool, + bytes: Bytes, + ) -> StorageResult<()> { + let key = self.ref_key(ref_key)?; + let mut builder = + self.client.put_object().bucket(self.bucket.clone()).key(key.clone()); + + if !overwrite_refs { + builder = builder.if_none_match("*") + } + + let res = builder.body(bytes.into()).send().await; + + match res { + Ok(_) => Ok(()), + Err(err) => { + let code = err.as_service_error().and_then(|e| e.code()).unwrap_or(""); + if code.contains("PreconditionFailed") + || code.contains("ConditionalRequestConflict") + { + Err(StorageError::RefAlreadyExists(key)) + } else { + Err(err.into()) + } + } + } + } +} diff --git a/icechunk/src/storage/virtual_ref.rs b/icechunk/src/storage/virtual_ref.rs index 4e5f95a8..72aa158e 100644 --- a/icechunk/src/storage/virtual_ref.rs +++ b/icechunk/src/storage/virtual_ref.rs @@ -1,20 +1,21 @@ use crate::format::manifest::{VirtualChunkLocation, VirtualReferenceError}; use crate::format::ByteRange; +use crate::private; use async_trait::async_trait; +use aws_sdk_s3::Client; use bytes::Bytes; -use object_store::{ - aws::AmazonS3Builder, path::Path as ObjectPath, GetOptions, GetRange, ObjectStore, -}; -use std::cmp::{max, min}; -use std::collections::HashMap; +use object_store::local::LocalFileSystem; +use object_store::{path::Path as ObjectPath, GetOptions, GetRange, ObjectStore}; +use serde::{Deserialize, Serialize}; +use std::cmp::min; use std::fmt::Debug; -use std::ops::Bound; -use std::sync::Arc; -use tokio::sync::RwLock; -use url; +use tokio::sync::OnceCell; +use url::{self, Url}; + +use super::s3::{mk_client, range_to_header, S3Config}; #[async_trait] -pub trait VirtualChunkResolver: Debug { +pub trait VirtualChunkResolver: Debug + private::Sealed { async fn fetch_chunk( &self, location: &VirtualChunkLocation, @@ -22,12 +23,86 @@ pub trait VirtualChunkResolver: Debug { ) -> Result; } -#[derive(PartialEq, Eq, Hash, Clone, Debug)] -struct StoreCacheKey(String, String); +#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)] +pub enum ObjectStoreVirtualChunkResolverConfig { + S3(S3Config), +} -#[derive(Debug, Default)] +#[derive(Debug)] pub struct ObjectStoreVirtualChunkResolver { - stores: RwLock>>, + s3: OnceCell, + config: Box>, +} + +impl ObjectStoreVirtualChunkResolver { + pub fn new(config: Option) -> Self { + Self { s3: Default::default(), config: Box::new(config) } + } + + async fn s3(&self) -> &Client { + let config = self.config.clone(); + self.s3 + .get_or_init(|| async move { + match config.as_ref() { + Some(ObjectStoreVirtualChunkResolverConfig::S3(config)) => { + mk_client(Some(config)).await + } + None => mk_client(None).await, + } + }) + .await + } + + async fn fetch_file( + &self, + url: &Url, + range: &ByteRange, + ) -> Result { + let store = LocalFileSystem::new(); + let options = + GetOptions { range: Option::::from(range), ..Default::default() }; + let path = ObjectPath::parse(url.path()) + .map_err(|e| VirtualReferenceError::OtherError(Box::new(e)))?; + + store + .get_opts(&path, options) + .await + .map_err(|e| VirtualReferenceError::FetchError(Box::new(e)))? 
+ .bytes() + .await + .map_err(|e| VirtualReferenceError::FetchError(Box::new(e))) + } + + async fn fetch_s3( + &self, + url: &Url, + range: &ByteRange, + ) -> Result { + let bucket_name = if let Some(host) = url.host_str() { + host.to_string() + } else { + Err(VirtualReferenceError::CannotParseBucketName( + "No bucket name found".to_string(), + ))? + }; + + let key = url.path(); + let key = key.strip_prefix('/').unwrap_or(key); + let mut b = self.s3().await.get_object().bucket(bucket_name).key(key); + + if let Some(header) = range_to_header(range) { + b = b.range(header) + }; + + Ok(b.send() + .await + .map_err(|e| VirtualReferenceError::FetchError(Box::new(e)))? + .body + .collect() + .await + .map_err(|e| VirtualReferenceError::FetchError(Box::new(e)))? + .into_bytes()) + } } // Converts the requested ByteRange to a valid ByteRange appropriate @@ -39,27 +114,27 @@ pub fn construct_valid_byte_range( ) -> ByteRange { // TODO: error for offset<0 // TODO: error if request.start > offset + length - // FIXME: we allow creating a ByteRange(start, end) where end < start - let new_offset = match request.0 { - Bound::Unbounded => chunk_offset, - Bound::Included(start) => max(start, 0) + chunk_offset, - Bound::Excluded(start) => max(start, 0) + chunk_offset + 1, - }; - request.length().map_or( - ByteRange( - Bound::Included(new_offset), - Bound::Excluded(chunk_offset + chunk_length), - ), - |reqlen| { - ByteRange( - Bound::Included(new_offset), - // no request can go past offset + length, so clamp it - Bound::Excluded(min(new_offset + reqlen, chunk_offset + chunk_length)), - ) - }, - ) + match request { + ByteRange::Bounded(std::ops::Range { start: req_start, end: req_end }) => { + let new_start = + min(chunk_offset + req_start, chunk_offset + chunk_length - 1); + let new_end = min(chunk_offset + req_end, chunk_offset + chunk_length); + ByteRange::Bounded(new_start..new_end) + } + ByteRange::From(n) => { + let new_start = min(chunk_offset + n, chunk_offset + chunk_length - 1); + ByteRange::Bounded(new_start..chunk_offset + chunk_length) + } + ByteRange::Last(n) => { + let new_end = chunk_offset + chunk_length; + let new_start = new_end - n; + ByteRange::Bounded(new_start..new_end) + } + } } +impl private::Sealed for ObjectStoreVirtualChunkResolver {} + #[async_trait] impl VirtualChunkResolver for ObjectStoreVirtualChunkResolver { async fn fetch_chunk( @@ -70,56 +145,13 @@ impl VirtualChunkResolver for ObjectStoreVirtualChunkResolver { let VirtualChunkLocation::Absolute(location) = location; let parsed = url::Url::parse(location).map_err(VirtualReferenceError::CannotParseUrl)?; - let bucket_name = parsed - .host_str() - .ok_or(VirtualReferenceError::CannotParseBucketName( - "error parsing bucket name".into(), - ))? - .to_string(); - let path = ObjectPath::parse(parsed.path()) - .map_err(|e| VirtualReferenceError::OtherError(Box::new(e)))?; let scheme = parsed.scheme(); - let cache_key = StoreCacheKey(scheme.into(), bucket_name); - let options = - GetOptions { range: Option::::from(range), ..Default::default() }; - let store = { - let stores = self.stores.read().await; - #[allow(clippy::expect_used)] - stores.get(&cache_key).map(Arc::clone) - }; - let store = match store { - Some(store) => store, - None => { - let builder = match scheme { - // FIXME: allow configuring auth for virtual references - "s3" => AmazonS3Builder::from_env(), - _ => { - Err(VirtualReferenceError::UnsupportedScheme(scheme.to_string()))? 
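// Worked example, not part of the patch, tracing `construct_valid_byte_range` above for a
// virtual chunk stored at chunk_offset = 100 with chunk_length = 50 (valid absolute bytes
// are [100, 150)):
//   request Bounded(10..20) -> Bounded(110..120)  (shifted into the chunk window)
//   request From(30)        -> Bounded(130..150)  (offset 30 through the end of the chunk)
//   request Last(5)         -> Bounded(145..150)  (the final 5 bytes of the chunk)
// A bounded request that runs past the chunk, e.g. Bounded(10..200), is clamped to end at 150.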
- } - }; - let new_store: Arc = Arc::new( - builder - .with_bucket_name(&cache_key.1) - .build() - .map_err(|e| VirtualReferenceError::FetchError(Box::new(e)))?, - ); - { - self.stores - .write() - .await - .insert(cache_key.clone(), Arc::clone(&new_store)); - } - new_store - } - }; - Ok(store - .get_opts(&path, options) - .await - .map_err(|e| VirtualReferenceError::FetchError(Box::new(e)))? - .bytes() - .await - .map_err(|e| VirtualReferenceError::FetchError(Box::new(e)))?) + match scheme { + "file" => self.fetch_file(&parsed, range).await, + "s3" => self.fetch_s3(&parsed, range).await, + _ => Err(VirtualReferenceError::UnsupportedScheme(scheme.to_string())), + } } } @@ -161,43 +193,28 @@ mod tests { // output.length() == requested.length() // output.0 >= chunk_ref.offset prop_assert_eq!( - construct_valid_byte_range( - &ByteRange(Bound::Included(0), Bound::Excluded(length)), - offset, - length, - ), - ByteRange(Bound::Included(offset), Bound::Excluded(max_end)) + construct_valid_byte_range(&ByteRange::Bounded(0..length), offset, length,), + ByteRange::Bounded(offset..max_end) ); prop_assert_eq!( construct_valid_byte_range( - &ByteRange(Bound::Unbounded, Bound::Excluded(length)), + &ByteRange::Bounded(request_offset..max_end), offset, length ), - ByteRange(Bound::Included(offset), Bound::Excluded(max_end)) + ByteRange::Bounded(request_offset + offset..max_end) ); prop_assert_eq!( - construct_valid_byte_range( - &ByteRange(Bound::Included(request_offset), Bound::Excluded(max_end)), - offset, - length - ), - ByteRange(Bound::Included(request_offset + offset), Bound::Excluded(max_end)) + construct_valid_byte_range(&ByteRange::ALL, offset, length), + ByteRange::Bounded(offset..offset + length) ); prop_assert_eq!( - construct_valid_byte_range(&ByteRange::ALL, offset, length), - ByteRange(Bound::Included(offset), Bound::Excluded(max_end)) + construct_valid_byte_range(&ByteRange::From(request_offset), offset, length), + ByteRange::Bounded(offset + request_offset..offset + length) ); prop_assert_eq!( - construct_valid_byte_range( - &ByteRange(Bound::Excluded(request_offset), Bound::Unbounded), - offset, - length - ), - ByteRange( - Bound::Included(offset + request_offset + 1), - Bound::Excluded(max_end) - ) + construct_valid_byte_range(&ByteRange::Last(request_offset), offset, length), + ByteRange::Bounded(offset + length - request_offset..offset + length) ); } } diff --git a/icechunk/src/strategies.rs b/icechunk/src/strategies.rs index 0f9bfcbc..6c306378 100644 --- a/icechunk/src/strategies.rs +++ b/icechunk/src/strategies.rs @@ -1,7 +1,7 @@ use std::num::NonZeroU64; -use std::path::PathBuf; use std::sync::Arc; +use prop::string::string_regex; use proptest::prelude::*; use proptest::{collection::vec, option, strategy::Strategy}; @@ -15,7 +15,10 @@ use crate::{ObjectStorage, Repository}; pub fn node_paths() -> impl Strategy { // FIXME: Add valid paths - any::() + #[allow(clippy::expect_used)] + vec(string_regex("[a-zA-Z0-9]*").expect("invalid regex"), 0..10).prop_map(|v| { + format!("/{}", v.join("/")).try_into().expect("invalid Path string") + }) } prop_compose! 
{ diff --git a/icechunk/src/zarr.rs b/icechunk/src/zarr.rs index 31a86229..bf32991a 100644 --- a/icechunk/src/zarr.rs +++ b/icechunk/src/zarr.rs @@ -1,8 +1,9 @@ use std::{ collections::HashSet, + fmt::Display, iter, num::NonZeroU64, - ops::DerefMut, + ops::{Deref, DerefMut}, path::PathBuf, sync::{ atomic::{AtomicUsize, Ordering}, @@ -20,18 +21,22 @@ use thiserror::Error; use tokio::sync::RwLock; use crate::{ + change_set::ChangeSet, format::{ manifest::VirtualChunkRef, snapshot::{NodeData, UserAttributesSnapshot}, ByteRange, ChunkOffset, IcechunkFormatError, SnapshotId, }, - refs::{BranchVersion, Ref}, + refs::{update_branch, BranchVersion, Ref, RefError}, repository::{ get_chunk, ArrayShape, ChunkIndices, ChunkKeyEncoding, ChunkPayload, ChunkShape, Codec, DataType, DimensionNames, FillValue, Path, RepositoryError, - StorageTransformer, UserAttributes, ZarrArrayMetadata, + RepositoryResult, StorageTransformer, UserAttributes, ZarrArrayMetadata, + }, + storage::{ + s3::{S3Config, S3Storage}, + virtual_ref::ObjectStoreVirtualChunkResolverConfig, }, - storage::object_store::S3Credentials, ObjectStorage, Repository, RepositoryBuilder, SnapshotMetadata, Storage, }; @@ -39,6 +44,7 @@ pub use crate::format::ObjectId; #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] #[serde(tag = "type")] +#[non_exhaustive] pub enum StorageConfig { #[serde(rename = "in_memory")] InMemory { prefix: Option }, @@ -50,13 +56,13 @@ pub enum StorageConfig { S3ObjectStore { bucket: String, prefix: String, - credentials: Option, - endpoint: Option, + #[serde(flatten)] + config: Option, }, } impl StorageConfig { - pub fn make_storage(&self) -> Result, String> { + pub async fn make_storage(&self) -> Result, String> { match self { StorageConfig::InMemory { prefix } => { Ok(Arc::new(ObjectStorage::new_in_memory_store(prefix.clone()))) @@ -66,27 +72,26 @@ impl StorageConfig { .map_err(|e| format!("Error creating storage: {e}"))?; Ok(Arc::new(storage)) } - StorageConfig::S3ObjectStore { bucket, prefix, credentials, endpoint } => { - let storage = ObjectStorage::new_s3_store( - bucket, - prefix, - credentials.clone(), - endpoint.clone(), - ) - .map_err(|e| format!("Error creating storage: {e}"))?; + StorageConfig::S3ObjectStore { bucket, prefix, config } => { + let storage = S3Storage::new_s3_store(bucket, prefix, config.as_ref()) + .await + .map_err(|e| format!("Error creating storage: {e}"))?; Ok(Arc::new(storage)) } } } - pub fn make_cached_storage(&self) -> Result, String> { - let storage = self.make_storage()?; + pub async fn make_cached_storage( + &self, + ) -> Result, String> { + let storage = self.make_storage().await?; let cached_storage = Repository::add_in_mem_asset_caching(storage); Ok(cached_storage) } } #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[non_exhaustive] pub enum VersionInfo { #[serde(rename = "snapshot_id")] SnapshotId(SnapshotId), @@ -102,6 +107,8 @@ pub struct RepositoryConfig { pub version: Option, pub inline_chunk_threshold_bytes: Option, pub unsafe_overwrite_refs: Option, + pub change_set_bytes: Option>, + pub virtual_ref_config: Option, } impl RepositoryConfig { @@ -128,6 +135,19 @@ impl RepositoryConfig { self } + pub fn with_virtual_ref_credentials( + mut self, + config: ObjectStoreVirtualChunkResolverConfig, + ) -> Self { + self.virtual_ref_config = Some(config); + self + } + + pub fn with_change_set_bytes(mut self, change_set_bytes: Vec) -> Self { + self.change_set_bytes = Some(change_set_bytes); + self + } + pub async fn make_repository( &self, storage: Arc, 
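// Editor's note: illustrative sketch, not part of the patch. It shows the effect of the
// `#[serde(flatten)]` attribute on the S3 storage configuration above: the fields of the
// nested config are read from the same JSON level as `bucket` and `prefix`, which is the
// shape the deserialization tests later in this diff rely on. The structs here are
// simplified stand-ins, not the real `StorageConfig`/`S3Config` types; serde and
// serde_json (both already used by the crate) are assumed as dependencies.
use serde::{Deserialize, Serialize};

#[derive(Debug, PartialEq, Serialize, Deserialize)]
struct S3ConfigSketch {
    endpoint: Option<String>,
    allow_http: bool,
}

#[derive(Debug, PartialEq, Serialize, Deserialize)]
struct S3StorageSketch {
    bucket: String,
    prefix: String,
    #[serde(flatten)] // inline the nested config instead of nesting it under "config"
    config: Option<S3ConfigSketch>,
}

fn main() -> Result<(), serde_json::Error> {
    // Note there is no "config" key: endpoint/allow_http sit next to bucket/prefix.
    let json = r#"{
        "bucket": "test",
        "prefix": "root",
        "endpoint": "http://localhost:9000",
        "allow_http": true
    }"#;
    let parsed: S3StorageSketch = serde_json::from_str(json)?;
    assert_eq!(parsed.bucket, "test");
    assert_eq!(
        parsed.config,
        Some(S3ConfigSketch {
            endpoint: Some("http://localhost:9000".to_string()),
            allow_http: true
        })
    );
    Ok(())
}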
@@ -167,6 +187,14 @@ impl RepositoryConfig { if let Some(value) = self.unsafe_overwrite_refs { builder.with_unsafe_overwrite_refs(value); } + if let Some(config) = &self.virtual_ref_config { + builder.with_virtual_ref_config(config.clone()); + } + if let Some(change_set_bytes) = &self.change_set_bytes { + let change_set = ChangeSet::import_from_bytes(change_set_bytes) + .map_err(|err| format!("Error parsing change set: {err}"))?; + builder.with_change_set(change_set); + } // TODO: add error checking, does the previous version exist? Ok((builder.build(), branch)) @@ -193,7 +221,23 @@ pub struct ConsolidatedStore { pub config: Option, } +impl ConsolidatedStore { + pub fn with_version(mut self, version: VersionInfo) -> Self { + self.repository.version = Some(version); + self + } + + pub fn with_change_set_bytes( + mut self, + change_set: Vec, + ) -> RepositoryResult { + self.repository.change_set_bytes = Some(change_set); + Ok(self) + } +} + #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +#[non_exhaustive] pub enum AccessMode { #[serde(rename = "r")] ReadOnly, @@ -201,17 +245,27 @@ pub enum AccessMode { ReadWrite, } +#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +pub enum ListDirItem { + Key(String), + Prefix(String), +} + pub type StoreResult = Result; #[derive(Debug, Clone, PartialEq, Eq, Error)] +#[non_exhaustive] pub enum KeyNotFoundError { #[error("chunk cannot be find for key `{key}`")] ChunkNotFound { key: String, path: Path, coords: ChunkIndices }, #[error("node not found at `{path}`")] NodeNotFound { path: Path }, + #[error("v2 key not found at `{key}`")] + ZarrV2KeyNotFound { key: String }, } #[derive(Debug, Error)] +#[non_exhaustive] pub enum StoreError { #[error("invalid zarr key format `{key}`")] InvalidKey { key: String }, @@ -221,6 +275,8 @@ pub enum StoreError { NotFound(#[from] KeyNotFoundError), #[error("unsuccessful repository operation: `{0}`")] RepositoryError(#[from] RepositoryError), + #[error("unsuccessful ref operation: `{0}`")] + RefError(#[from] RefError), #[error("cannot commit when no snapshot is present")] NoSnapshot, #[error("all commits must be made on a branch")] @@ -256,7 +312,7 @@ impl Store { consolidated: &ConsolidatedStore, mode: AccessMode, ) -> Result { - let storage = consolidated.storage.make_cached_storage()?; + let storage = consolidated.storage.make_cached_storage().await?; let (repository, branch) = consolidated.repository.make_repository(storage).await?; Ok(Self::from_repository(repository, mode, branch, consolidated.config.clone())) @@ -290,6 +346,20 @@ impl Store { } } + /// Creates a new clone of the store with the given access mode. + pub fn with_access_mode(&self, mode: AccessMode) -> Self { + Store { + repository: self.repository.clone(), + mode, + current_branch: self.current_branch.clone(), + config: self.config.clone(), + } + } + + pub fn access_mode(&self) -> &AccessMode { + &self.mode + } + pub fn current_branch(&self) -> &Option { &self.current_branch } @@ -298,6 +368,14 @@ impl Store { self.repository.read().await.snapshot_id().clone() } + pub async fn current_version(&self) -> VersionInfo { + if let Some(branch) = &self.current_branch { + VersionInfo::BranchTipRef(branch.clone()) + } else { + VersionInfo::SnapshotId(self.snapshot_id().await) + } + } + pub async fn has_uncommitted_changes(&self) -> bool { self.repository.read().await.has_uncommitted_changes() } @@ -323,33 +401,28 @@ impl Store { /// /// If there are uncommitted changes, this method will return an error. 
pub async fn checkout(&mut self, version: VersionInfo) -> StoreResult<()> { - // this needs to be done carefully to avoid deadlocks and race conditions - let storage = { - let guard = self.repository.read().await; - // Checking out is not allowed if there are uncommitted changes - if guard.has_uncommitted_changes() { - return Err(StoreError::UncommittedChanges); - } - guard.storage().clone() - }; + let mut repo = self.repository.write().await; + + // Checking out is not allowed if there are uncommitted changes + if repo.has_uncommitted_changes() { + return Err(StoreError::UncommittedChanges); + } - let repository = match version { + match version { VersionInfo::SnapshotId(sid) => { self.current_branch = None; - Repository::update(storage, sid) + repo.set_snapshot_id(sid); } VersionInfo::TagRef(tag) => { self.current_branch = None; - Repository::from_tag(storage, &tag).await? + repo.set_snapshot_from_tag(tag.as_str()).await? } VersionInfo::BranchTipRef(branch) => { self.current_branch = Some(branch.clone()); - Repository::from_branch_tip(storage, &branch).await? + repo.set_snapshot_from_branch(&branch).await? } } - .build(); - self.repository = Arc::new(RwLock::new(repository)); Ok(()) } @@ -372,16 +445,62 @@ impl Store { Ok((snapshot_id, version)) } + /// Make the current branch point to the given snapshot. + /// This fails if there are uncommitted changes, or if the branch has been updated + /// since checkout. + /// After execution, history of the repo branch will be altered, and the current + /// store will point to a different base snapshot_id + pub async fn reset_branch( + &mut self, + to_snapshot: SnapshotId, + ) -> StoreResult { + // TODO: this should check the snapshot exists + let mut guard = self.repository.write().await; + if guard.has_uncommitted_changes() { + return Err(StoreError::UncommittedChanges); + } + match self.current_branch() { + None => Err(StoreError::NotOnBranch), + Some(branch) => { + let old_snapshot = guard.snapshot_id(); + let storage = guard.storage(); + let overwrite = guard.config().unsafe_overwrite_refs; + let version = update_branch( + storage.as_ref(), + branch.as_str(), + to_snapshot.clone(), + Some(old_snapshot), + overwrite, + ) + .await?; + guard.set_snapshot_id(to_snapshot); + Ok(version) + } + } + } + /// Commit the current changes to the current branch. If the store is not currently /// on a branch, this will return an error. pub async fn commit(&mut self, message: &str) -> StoreResult { + self.distributed_commit(message, vec![]).await + } + + pub async fn distributed_commit<'a, I: IntoIterator>>( + &mut self, + message: &str, + other_changesets_bytes: I, + ) -> StoreResult { if let Some(branch) = &self.current_branch { + let other_change_sets: Vec = other_changesets_bytes + .into_iter() + .map(|v| ChangeSet::import_from_bytes(v.as_slice())) + .try_collect()?; let result = self .repository .write() .await .deref_mut() - .commit(branch, message, None) + .distributed_commit(branch, other_change_sets, message, None) .await?; Ok(result) } else { @@ -405,27 +524,23 @@ impl Store { Ok(futures::stream::iter(all)) } + pub async fn change_set_bytes(&self) -> StoreResult> { + Ok(self.repository.read().await.change_set_bytes()?) + } + pub async fn empty(&self) -> StoreResult { let res = self.repository.read().await.list_nodes().await?.next().is_none(); Ok(res) } pub async fn clear(&mut self) -> StoreResult<()> { - todo!() + let mut repo = self.repository.write().await; + Ok(repo.clear().await?) 
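// Editor's note (not part of the patch): `distributed_commit` above is the cooperative-write
// path. Each remote writer stages changes against its own store, exports them as bytes with
// `change_set_bytes()`, and ships those bytes to a single coordinator. The coordinator passes
// the collected blobs to `distributed_commit`, which re-imports each one via
// `ChangeSet::import_from_bytes` and folds them, together with its own changes, into a single
// commit on the branch. `test_distributed_writes` later in this diff exercises exactly this
// flow with four Repository instances writing disjoint chunk ranges.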
} - // TODO: prototype argument pub async fn get(&self, key: &str, byte_range: &ByteRange) -> StoreResult { - let bytes = match Key::parse(key)? { - Key::Metadata { node_path } => { - self.get_metadata(key, &node_path, byte_range).await - } - Key::Chunk { node_path, coords } => { - self.get_chunk(key, node_path, coords, byte_range).await - } - }?; - - Ok(bytes) + let repo = self.repository.read().await; + get_key(key, byte_range, repo.deref()).await } /// Get all the requested keys concurrently. @@ -489,13 +604,9 @@ impl Store { res.ok_or(StoreError::PartialValuesPanic) } - // TODO: prototype argument pub async fn exists(&self, key: &str) -> StoreResult { - match self.get(key, &ByteRange::ALL).await { - Ok(_) => Ok(true), - Err(StoreError::NotFound(_)) => Ok(false), - Err(other_error) => Err(other_error), - } + let guard = self.repository.read().await; + exists(key, guard.deref()).await } pub fn supports_writes(&self) -> StoreResult { @@ -507,6 +618,15 @@ impl Store { } pub async fn set(&self, key: &str, value: Bytes) -> StoreResult<()> { + self.set_with_optional_locking(key, value, None).await + } + + async fn set_with_optional_locking( + &self, + key: &str, + value: Bytes, + locked_repo: Option<&mut Repository>, + ) -> StoreResult<()> { if self.mode == AccessMode::ReadOnly { return Err(StoreError::ReadOnly); } @@ -514,29 +634,50 @@ impl Store { match Key::parse(key)? { Key::Metadata { node_path } => { if let Ok(array_meta) = serde_json::from_slice(value.as_ref()) { - self.set_array_meta(node_path, array_meta).await + self.set_array_meta(node_path, array_meta, locked_repo).await } else { match serde_json::from_slice(value.as_ref()) { Ok(group_meta) => { - self.set_group_meta(node_path, group_meta).await + self.set_group_meta(node_path, group_meta, locked_repo).await } Err(err) => Err(StoreError::BadMetadata(err)), } } } Key::Chunk { node_path, coords } => { - // we only lock the repository to get the writer - let writer = self.repository.read().await.get_chunk_writer(); - // then we can write the bytes without holding the lock - let payload = writer(value).await?; - // and finally we lock for write and update the reference - self.repository - .write() - .await - .set_chunk_ref(node_path, coords, Some(payload)) - .await?; + match locked_repo { + Some(repo) => { + let writer = repo.get_chunk_writer(); + let payload = writer(value).await?; + repo.set_chunk_ref(node_path, coords, Some(payload)).await? + } + None => { + // we only lock the repository to get the writer + let writer = self.repository.read().await.get_chunk_writer(); + // then we can write the bytes without holding the lock + let payload = writer(value).await?; + // and finally we lock for write and update the reference + self.repository + .write() + .await + .set_chunk_ref(node_path, coords, Some(payload)) + .await? + } + } Ok(()) } + Key::ZarrV2(_) => Err(StoreError::Unimplemented( + "Icechunk cannot set Zarr V2 metadata keys", + )), + } + } + + pub async fn set_if_not_exists(&self, key: &str, value: Bytes) -> StoreResult<()> { + let mut guard = self.repository.write().await; + if exists(key, guard.deref()).await? { + Ok(()) + } else { + self.set_with_optional_locking(key, value, Some(guard.deref_mut())).await } } @@ -551,10 +692,6 @@ impl Store { } match Key::parse(key)? { - Key::Metadata { .. } => Err(StoreError::NotAllowed(format!( - "use .set to modify metadata for key {}", - key - ))), Key::Chunk { node_path, coords } => { self.repository .write() @@ -567,6 +704,9 @@ impl Store { .await?; Ok(()) } + Key::Metadata { .. 
} | Key::ZarrV2(_) => Err(StoreError::NotAllowed( + format!("use .set to modify metadata for key {}", key), + )), } } @@ -598,6 +738,7 @@ impl Store { let repository = guard.deref_mut(); Ok(repository.set_chunk_ref(node_path, coords, None).await?) } + Key::ZarrV2(_) => Ok(()), } } @@ -648,6 +789,22 @@ impl Store { // FIXME: this is not lazy, it goes through every chunk. This should be implemented using // metadata only, and ignore the chunks, but we should decide on that based on Zarr3 spec // evolution + let res = self.list_dir_items(prefix).await?.map_ok(|item| match item { + ListDirItem::Key(k) => k, + ListDirItem::Prefix(p) => p, + }); + Ok(res) + } + + pub async fn list_dir_items( + &self, + prefix: &str, + ) -> StoreResult> + Send> { + // TODO: this is inefficient because it filters based on the prefix, instead of only + // generating items that could potentially match + // FIXME: this is not lazy, it goes through every chunk. This should be implemented using + // metadata only, and ignore the chunks, but we should decide on that based on Zarr3 spec + // evolution let idx: usize = if prefix == "/" { 0 } else { prefix.len() }; @@ -655,11 +812,13 @@ impl Store { .list_prefix(prefix) .await? .map_ok(move |s| { - // If the prefix is "/", get rid of it. This can happend when prefix is missing + // If the prefix is "/", get rid of it. This can happen when prefix is missing // the trailing slash (as it does in zarr-python impl) let rem = &s[idx..].trim_start_matches('/'); - let parent = rem.split_once('/').map_or(*rem, |(parent, _)| parent); - parent.to_string() + match rem.split_once('/') { + Some((prefix, _)) => ListDirItem::Prefix(prefix.to_string()), + None => ListDirItem::Key(rem.to_string()), + } }) .try_collect() .await?; @@ -668,99 +827,50 @@ impl Store { Ok(futures::stream::iter(parents.into_iter().map(Ok))) } - async fn get_chunk( + async fn set_array_meta( &self, - key: &str, path: Path, - coords: ChunkIndices, - byte_range: &ByteRange, - ) -> StoreResult { - // we only lock the repository while we get the reader - let reader = self - .repository - .read() - .await - .get_chunk_reader(&path, &coords, byte_range) - .await?; - - // then we can fetch the bytes without holding the lock - let chunk = get_chunk(reader).await?; - chunk.ok_or(StoreError::NotFound(KeyNotFoundError::ChunkNotFound { - key: key.to_string(), - path, - coords, - })) - } - - async fn get_metadata( - &self, - _key: &str, - path: &Path, - range: &ByteRange, - ) -> StoreResult { - let node = self.repository.read().await.get_node(path).await.map_err(|_| { - StoreError::NotFound(KeyNotFoundError::NodeNotFound { path: path.clone() }) - })?; - let user_attributes = match node.user_attributes { - None => None, - Some(UserAttributesSnapshot::Inline(atts)) => Some(atts), - // FIXME: implement - Some(UserAttributesSnapshot::Ref(_)) => todo!(), - }; - let full_metadata = match node.node_data { - NodeData::Group => { - Ok::(GroupMetadata::new(user_attributes).to_bytes()) - } - NodeData::Array(zarr_metadata, _) => { - Ok(ArrayMetadata::new(user_attributes, zarr_metadata).to_bytes()) - } - }?; - - Ok(range.slice(full_metadata)) + array_meta: ArrayMetadata, + locked_repo: Option<&mut Repository>, + ) -> Result<(), StoreError> { + match locked_repo { + Some(repo) => set_array_meta(path, array_meta, repo).await, + None => self.set_array_meta_locking(path, array_meta).await, + } } - async fn set_array_meta( + async fn set_array_meta_locking( &self, path: Path, array_meta: ArrayMetadata, ) -> Result<(), StoreError> { // we need to 
hold the lock while we search the array and do the update to avoid race // conditions with other writers (notice we don't take &mut self) - let mut guard = self.repository.write().await; - if guard.get_array(&path).await.is_ok() { - // TODO: we don't necessarily need to update both - let repository = guard.deref_mut(); - repository.set_user_attributes(path.clone(), array_meta.attributes).await?; - repository.update_array(path, array_meta.zarr_metadata).await?; - Ok(()) - } else { - let repository = guard.deref_mut(); - repository.add_array(path.clone(), array_meta.zarr_metadata).await?; - repository.set_user_attributes(path, array_meta.attributes).await?; - Ok(()) - } + set_array_meta(path, array_meta, guard.deref_mut()).await } async fn set_group_meta( &self, path: Path, group_meta: GroupMetadata, + locked_repo: Option<&mut Repository>, + ) -> Result<(), StoreError> { + match locked_repo { + Some(repo) => set_group_meta(path, group_meta, repo).await, + None => self.set_group_meta_locking(path, group_meta).await, + } + } + + async fn set_group_meta_locking( + &self, + path: Path, + group_meta: GroupMetadata, ) -> Result<(), StoreError> { - // we need to hold the lock while we search the group and do the update to avoid race + // we need to hold the lock while we search the array and do the update to avoid race // conditions with other writers (notice we don't take &mut self) - // let mut guard = self.repository.write().await; - if guard.get_group(&path).await.is_ok() { - let repository = guard.deref_mut(); - repository.set_user_attributes(path, group_meta.attributes).await?; - Ok(()) - } else { - let repository = guard.deref_mut(); - repository.add_group(path.clone()).await?; - repository.set_user_attributes(path, group_meta.attributes).await?; - Ok(()) - } + set_group_meta(path, group_meta, guard.deref_mut()).await } async fn list_metadata_prefix<'a, 'b: 'a>( @@ -773,9 +883,19 @@ impl Store { for node in repository.list_nodes().await? { // TODO: handle non-utf8? let meta_key = Key::Metadata { node_path: node.path }.to_string(); - if let Some(key) = meta_key { - if key.starts_with(prefix) { - yield key; + match meta_key.strip_prefix(prefix) { + None => {} + Some(rest) => { + // we have a few cases + if prefix.is_empty() // if prefix was empty anything matches + || rest.is_empty() // if stripping prefix left empty we have a match + || rest.starts_with('/') // next component so we match + // what we don't include is other matches, + // we want to catch prefix/foo but not prefix-foo + { + yield meta_key; + } + } } } @@ -797,10 +917,8 @@ impl Store { match maybe_path_chunk { Ok((path,chunk)) => { let chunk_key = Key::Chunk { node_path: path, coords: chunk.coord }.to_string(); - if let Some(key) = chunk_key { - if key.starts_with(prefix) { - yield key; - } + if chunk_key.starts_with(prefix) { + yield chunk_key; } } Err(err) => Err(err)? 
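// Editor's note: illustrative sketch, not part of the patch. It isolates the prefix test used
// by `list_metadata_prefix` above: a key matches a prefix only at a path-component boundary,
// so "group" matches "group/zarr.json" but not "group-suffix/zarr.json" (the behaviour the
// `test_list_dir_with_prefix` test later in this diff checks). The helper name is invented
// for the example.
fn matches_prefix(prefix: &str, key: &str) -> bool {
    match key.strip_prefix(prefix) {
        None => false,
        Some(rest) => prefix.is_empty() || rest.is_empty() || rest.starts_with('/'),
    }
}

fn main() {
    assert!(matches_prefix("", "zarr.json")); // empty prefix matches anything
    assert!(matches_prefix("group", "group")); // exact match
    assert!(matches_prefix("group", "group/zarr.json")); // next path component
    assert!(!matches_prefix("group", "group-suffix/zarr.json")); // not a component boundary
}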
@@ -811,10 +929,119 @@ impl Store { } } +async fn set_array_meta( + path: Path, + array_meta: ArrayMetadata, + repo: &mut Repository, +) -> Result<(), StoreError> { + if repo.get_array(&path).await.is_ok() { + // TODO: we don't necessarily need to update both + repo.set_user_attributes(path.clone(), array_meta.attributes).await?; + repo.update_array(path, array_meta.zarr_metadata).await?; + Ok(()) + } else { + repo.add_array(path.clone(), array_meta.zarr_metadata).await?; + repo.set_user_attributes(path, array_meta.attributes).await?; + Ok(()) + } +} + +async fn set_group_meta( + path: Path, + group_meta: GroupMetadata, + repo: &mut Repository, +) -> Result<(), StoreError> { + // we need to hold the lock while we search the group and do the update to avoid race + // conditions with other writers (notice we don't take &mut self) + // + if repo.get_group(&path).await.is_ok() { + repo.set_user_attributes(path, group_meta.attributes).await?; + Ok(()) + } else { + repo.add_group(path.clone()).await?; + repo.set_user_attributes(path, group_meta.attributes).await?; + Ok(()) + } +} + +async fn get_metadata( + _key: &str, + path: &Path, + range: &ByteRange, + repo: &Repository, +) -> StoreResult { + let node = repo.get_node(path).await.map_err(|_| { + StoreError::NotFound(KeyNotFoundError::NodeNotFound { path: path.clone() }) + })?; + let user_attributes = match node.user_attributes { + None => None, + Some(UserAttributesSnapshot::Inline(atts)) => Some(atts), + // FIXME: implement + Some(UserAttributesSnapshot::Ref(_)) => todo!(), + }; + let full_metadata = match node.node_data { + NodeData::Group => { + Ok::(GroupMetadata::new(user_attributes).to_bytes()) + } + NodeData::Array(zarr_metadata, _) => { + Ok(ArrayMetadata::new(user_attributes, zarr_metadata).to_bytes()) + } + }?; + + Ok(range.slice(full_metadata)) +} + +async fn get_chunk_bytes( + key: &str, + path: Path, + coords: ChunkIndices, + byte_range: &ByteRange, + repo: &Repository, +) -> StoreResult { + let reader = repo.get_chunk_reader(&path, &coords, byte_range).await?; + + // then we can fetch the bytes without holding the lock + let chunk = get_chunk(reader).await?; + chunk.ok_or(StoreError::NotFound(KeyNotFoundError::ChunkNotFound { + key: key.to_string(), + path, + coords, + })) +} + +async fn get_key( + key: &str, + byte_range: &ByteRange, + repo: &Repository, +) -> StoreResult { + let bytes = match Key::parse(key)? 
{ + Key::Metadata { node_path } => { + get_metadata(key, &node_path, byte_range, repo).await + } + Key::Chunk { node_path, coords } => { + get_chunk_bytes(key, node_path, coords, byte_range, repo).await + } + Key::ZarrV2(key) => { + Err(StoreError::NotFound(KeyNotFoundError::ZarrV2KeyNotFound { key })) + } + }?; + + Ok(bytes) +} + +async fn exists(key: &str, repo: &Repository) -> StoreResult { + match get_key(key, &ByteRange::ALL, repo).await { + Ok(_) => Ok(true), + Err(StoreError::NotFound(_)) => Ok(false), + Err(other_error) => Err(other_error), + } +} + #[derive(Debug, Clone, PartialEq, Eq)] enum Key { Metadata { node_path: Path }, Chunk { node_path: Path, coords: ChunkIndices }, + ZarrV2(String), } impl Key { @@ -824,9 +1051,21 @@ impl Key { fn parse(key: &str) -> Result { fn parse_chunk(key: &str) -> Result { + if key == ".zgroup" + || key == ".zarray" + || key == ".zattrs" + || key == ".zmetadata" + || key.ends_with("/.zgroup") + || key.ends_with("/.zarray") + || key.ends_with("/.zattrs") + || key.ends_with("/.zmetadata") + { + return Ok(Key::ZarrV2(key.to_string())); + } + if key == "c" { return Ok(Key::Chunk { - node_path: "/".into(), + node_path: Path::root(), coords: ChunkIndices(vec![]), }); } @@ -834,10 +1073,15 @@ impl Key { let path = path.strip_suffix('/').unwrap_or(path); if coords.is_empty() { Ok(Key::Chunk { - node_path: ["/", path].iter().collect(), + node_path: format!("/{path}").try_into().map_err(|_| { + StoreError::InvalidKey { key: key.to_string() } + })?, coords: ChunkIndices(vec![]), }) } else { + let absolute = format!("/{path}") + .try_into() + .map_err(|_| StoreError::InvalidKey { key: key.to_string() })?; coords .strip_prefix('/') .ok_or(StoreError::InvalidKey { key: key.to_string() })? @@ -845,7 +1089,7 @@ impl Key { .map(|s| s.parse::()) .collect::, _>>() .map(|coords| Key::Chunk { - node_path: ["/", path].iter().collect(), + node_path: absolute, coords: ChunkIndices(coords), }) .map_err(|_| StoreError::InvalidKey { key: key.to_string() }) @@ -856,31 +1100,39 @@ impl Key { } if key == Key::ROOT_KEY { - Ok(Key::Metadata { node_path: "/".into() }) + Ok(Key::Metadata { node_path: Path::root() }) } else if let Some(path) = key.strip_suffix(Key::METADATA_SUFFIX) { // we need to be careful indexing into utf8 strings - Ok(Key::Metadata { node_path: ["/", path].iter().collect() }) + Ok(Key::Metadata { + node_path: format!("/{path}") + .try_into() + .map_err(|_| StoreError::InvalidKey { key: key.to_string() })?, + }) } else { parse_chunk(key) } } +} - fn to_string(&self) -> Option { +impl Display for Key { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - Key::Metadata { node_path } => node_path.as_path().to_str().map(|s| { - format!("{}{}", &s[1..], Key::METADATA_SUFFIX) - .trim_start_matches('/') - .to_string() - }), + Key::Metadata { node_path } => { + let s = + format!("{}{}", &node_path.to_string()[1..], Key::METADATA_SUFFIX) + .trim_start_matches('/') + .to_string(); + f.write_str(s.as_str()) + } Key::Chunk { node_path, coords } => { - node_path.as_path().to_str().map(|path| { - let coords = coords.0.iter().map(|c| c.to_string()).join("/"); - [path[1..].to_string(), "c".to_string(), coords] - .iter() - .filter(|s| !s.is_empty()) - .join("/") - }) + let coords = coords.0.iter().map(|c| c.to_string()).join("/"); + let s = [node_path.to_string()[1..].to_string(), "c".to_string(), coords] + .iter() + .filter(|s| !s.is_empty()) + .join("/"); + f.write_str(s.as_str()) } + Key::ZarrV2(key) => f.write_str(key.as_str()), } } } @@ 
-891,6 +1143,7 @@ struct ArrayMetadata { zarr_format: u8, #[serde(deserialize_with = "validate_array_node_type")] node_type: String, + #[serde(skip_serializing_if = "Option::is_none")] attributes: Option, #[serde(flatten)] #[serde_as(as = "TryFromInto")] @@ -1006,7 +1259,8 @@ impl From for ZarrArrayMetadataSerialzer { } FillValue::Complex64(r, i) => ([r, i].as_ref()).into(), FillValue::Complex128(r, i) => ([r, i].as_ref()).into(), - FillValue::RawBits(r) => r.into(), + FillValue::String(s) => s.into(), + FillValue::Bytes(b) => b.into(), } } @@ -1030,6 +1284,7 @@ struct GroupMetadata { zarr_format: u8, #[serde(deserialize_with = "validate_group_node_type")] node_type: String, + #[serde(skip_serializing_if = "Option::is_none")] attributes: Option, } @@ -1188,6 +1443,8 @@ mod tests { use std::borrow::BorrowMut; + use crate::storage::s3::{S3Credentials, StaticS3Credentials}; + use super::*; use pretty_assertions::assert_eq; @@ -1212,91 +1469,132 @@ mod tests { fn test_parse_key() { assert!(matches!( Key::parse("zarr.json"), - Ok(Key::Metadata { node_path}) if node_path.to_str() == Some("/") + Ok(Key::Metadata { node_path}) if node_path.to_string() == "/" )); assert!(matches!( Key::parse("a/zarr.json"), - Ok(Key::Metadata { node_path }) if node_path.to_str() == Some("/a") + Ok(Key::Metadata { node_path }) if node_path.to_string() == "/a" )); assert!(matches!( Key::parse("a/b/c/zarr.json"), - Ok(Key::Metadata { node_path }) if node_path.to_str() == Some("/a/b/c") + Ok(Key::Metadata { node_path }) if node_path.to_string() == "/a/b/c" )); assert!(matches!( Key::parse("foo/c"), - Ok(Key::Chunk { node_path, coords }) if node_path.to_str() == Some("/foo") && coords == ChunkIndices(vec![]) + Ok(Key::Chunk { node_path, coords }) if node_path.to_string() == "/foo" && coords == ChunkIndices(vec![]) )); assert!(matches!( Key::parse("foo/bar/c"), - Ok(Key::Chunk { node_path, coords}) if node_path.to_str() == Some("/foo/bar") && coords == ChunkIndices(vec![]) + Ok(Key::Chunk { node_path, coords}) if node_path.to_string() == "/foo/bar" && coords == ChunkIndices(vec![]) )); assert!(matches!( Key::parse("foo/c/1/2/3"), Ok(Key::Chunk { node_path, coords, - }) if node_path.to_str() == Some("/foo") && coords == ChunkIndices(vec![1,2,3]) + }) if node_path.to_string() == "/foo" && coords == ChunkIndices(vec![1,2,3]) )); assert!(matches!( Key::parse("foo/bar/baz/c/1/2/3"), Ok(Key::Chunk { node_path, coords, - }) if node_path.to_str() == Some("/foo/bar/baz") && coords == ChunkIndices(vec![1,2,3]) + }) if node_path.to_string() == "/foo/bar/baz" && coords == ChunkIndices(vec![1,2,3]) )); assert!(matches!( Key::parse("c"), - Ok(Key::Chunk { node_path, coords}) if node_path.to_str() == Some("/") && coords == ChunkIndices(vec![]) + Ok(Key::Chunk { node_path, coords}) if node_path.to_string() == "/" && coords == ChunkIndices(vec![]) )); assert!(matches!( Key::parse("c/0/0"), - Ok(Key::Chunk { node_path, coords}) if node_path.to_str() == Some("/") && coords == ChunkIndices(vec![0,0]) + Ok(Key::Chunk { node_path, coords}) if node_path.to_string() == "/" && coords == ChunkIndices(vec![0,0]) + )); + assert!(matches!( + Key::parse(".zarray"), + Ok(Key::ZarrV2(s) ) if s == ".zarray" + )); + assert!(matches!( + Key::parse(".zgroup"), + Ok(Key::ZarrV2(s) ) if s == ".zgroup" + )); + assert!(matches!( + Key::parse(".zattrs"), + Ok(Key::ZarrV2(s) ) if s == ".zattrs" + )); + assert!(matches!( + Key::parse(".zmetadata"), + Ok(Key::ZarrV2(s) ) if s == ".zmetadata" + )); + assert!(matches!( + Key::parse("foo/.zgroup"), + 
Ok(Key::ZarrV2(s) ) if s == "foo/.zgroup" + )); + assert!(matches!( + Key::parse("foo/bar/.zarray"), + Ok(Key::ZarrV2(s) ) if s == "foo/bar/.zarray" + )); + assert!(matches!( + Key::parse("foo/.zmetadata"), + Ok(Key::ZarrV2(s) ) if s == "foo/.zmetadata" + )); + assert!(matches!( + Key::parse("foo/.zattrs"), + Ok(Key::ZarrV2(s) ) if s == "foo/.zattrs" )); } #[test] fn test_format_key() { assert_eq!( - Key::Metadata { node_path: "/".into() }.to_string(), - Some("zarr.json".to_string()) + Key::Metadata { node_path: Path::root() }.to_string(), + "zarr.json".to_string() ); assert_eq!( - Key::Metadata { node_path: "/a".into() }.to_string(), - Some("a/zarr.json".to_string()) + Key::Metadata { node_path: "/a".try_into().unwrap() }.to_string(), + "a/zarr.json".to_string() ); assert_eq!( - Key::Metadata { node_path: "/a/b/c".into() }.to_string(), - Some("a/b/c/zarr.json".to_string()) + Key::Metadata { node_path: "/a/b/c".try_into().unwrap() }.to_string(), + "a/b/c/zarr.json".to_string() ); assert_eq!( - Key::Chunk { node_path: "/".into(), coords: ChunkIndices(vec![]) } + Key::Chunk { node_path: Path::root(), coords: ChunkIndices(vec![]) } .to_string(), - Some("c".to_string()) + "c".to_string() ); assert_eq!( - Key::Chunk { node_path: "/".into(), coords: ChunkIndices(vec![0]) } + Key::Chunk { node_path: Path::root(), coords: ChunkIndices(vec![0]) } .to_string(), - Some("c/0".to_string()) + "c/0".to_string() ); assert_eq!( - Key::Chunk { node_path: "/".into(), coords: ChunkIndices(vec![1, 2]) } + Key::Chunk { node_path: Path::root(), coords: ChunkIndices(vec![1, 2]) } .to_string(), - Some("c/1/2".to_string()) + "c/1/2".to_string() ); assert_eq!( - Key::Chunk { node_path: "/a".into(), coords: ChunkIndices(vec![]) } - .to_string(), - Some("a/c".to_string()) + Key::Chunk { + node_path: "/a".try_into().unwrap(), + coords: ChunkIndices(vec![]) + } + .to_string(), + "a/c".to_string() ); assert_eq!( - Key::Chunk { node_path: "/a".into(), coords: ChunkIndices(vec![1]) } - .to_string(), - Some("a/c/1".to_string()) + Key::Chunk { + node_path: "/a".try_into().unwrap(), + coords: ChunkIndices(vec![1]) + } + .to_string(), + "a/c/1".to_string() ); assert_eq!( - Key::Chunk { node_path: "/a".into(), coords: ChunkIndices(vec![1, 2]) } - .to_string(), - Some("a/c/1/2".to_string()) + Key::Chunk { + node_path: "/a".try_into().unwrap(), + coords: ChunkIndices(vec![1, 2]) + } + .to_string(), + "a/c/1/2".to_string() ); } @@ -1419,7 +1717,7 @@ mod tests { assert!(matches!( store.get("zarr.json", &ByteRange::ALL).await, - Err(StoreError::NotFound(KeyNotFoundError::NodeNotFound {path})) if path.to_str() == Some("/") + Err(StoreError::NotFound(KeyNotFoundError::NodeNotFound {path})) if path.to_string() == "/" )); store @@ -1430,9 +1728,7 @@ mod tests { .await?; assert_eq!( store.get("zarr.json", &ByteRange::ALL).await.unwrap(), - Bytes::copy_from_slice( - br#"{"zarr_format":3,"node_type":"group","attributes":null}"# - ) + Bytes::copy_from_slice(br#"{"zarr_format":3,"node_type":"group"}"#) ); store.set("a/b/zarr.json", Bytes::copy_from_slice(br#"{"zarr_format":3, "node_type":"group", "attributes": {"spam":"ham", "eggs":42}}"#)).await?; @@ -1483,14 +1779,14 @@ mod tests { assert!(matches!( store.get("array/zarr.json", &ByteRange::ALL).await, Err(StoreError::NotFound(KeyNotFoundError::NodeNotFound { path })) - if path.to_str() == Some("/array"), + if path.to_string() == "/array", )); store.set("array/zarr.json", zarr_meta.clone()).await.unwrap(); store.delete("array/zarr.json").await.unwrap(); assert!(matches!( 
store.get("array/zarr.json", &ByteRange::ALL).await, Err(StoreError::NotFound(KeyNotFoundError::NodeNotFound { path } )) - if path.to_str() == Some("/array"), + if path.to_string() == "/array", )); store.set("array/zarr.json", Bytes::copy_from_slice(group_data)).await.unwrap(); @@ -1629,7 +1925,7 @@ mod tests { assert!(matches!( store.get("array/c/0/1/0", &ByteRange::ALL).await, Err(StoreError::NotFound(KeyNotFoundError::ChunkNotFound { key, path, coords })) - if key == "array/c/0/1/0" && path.to_str() == Some("/array") && coords == ChunkIndices([0, 1, 0].to_vec()) + if key == "array/c/0/1/0" && path.to_string() == "/array" && coords == ChunkIndices([0, 1, 0].to_vec()) )); assert!(matches!( store.delete("array/foo").await, @@ -1830,21 +2126,112 @@ mod tests { dir.sort(); assert_eq!(dir, vec!["array".to_string(), "zarr.json".to_string()]); + let mut dir = store.list_dir_items("/").await?.try_collect::>().await?; + dir.sort(); + assert_eq!( + dir, + vec![ + ListDirItem::Key("zarr.json".to_string()), + ListDirItem::Prefix("array".to_string()) + ] + ); + let mut dir = store.list_dir("array").await?.try_collect::>().await?; dir.sort(); assert_eq!(dir, vec!["c".to_string(), "zarr.json".to_string()]); + let mut dir = + store.list_dir_items("array").await?.try_collect::>().await?; + dir.sort(); + assert_eq!( + dir, + vec![ + ListDirItem::Key("zarr.json".to_string()), + ListDirItem::Prefix("c".to_string()) + ] + ); + let mut dir = store.list_dir("array/").await?.try_collect::>().await?; dir.sort(); assert_eq!(dir, vec!["c".to_string(), "zarr.json".to_string()]); + let mut dir = + store.list_dir_items("array/").await?.try_collect::>().await?; + dir.sort(); + assert_eq!( + dir, + vec![ + ListDirItem::Key("zarr.json".to_string()), + ListDirItem::Prefix("c".to_string()) + ] + ); + let mut dir = store.list_dir("array/c/").await?.try_collect::>().await?; dir.sort(); assert_eq!(dir, vec!["0".to_string(), "1".to_string()]); + let mut dir = + store.list_dir_items("array/c/").await?.try_collect::>().await?; + dir.sort(); + assert_eq!( + dir, + vec![ + ListDirItem::Prefix("0".to_string()), + ListDirItem::Prefix("1".to_string()), + ] + ); + let mut dir = store.list_dir("array/c/1/").await?.try_collect::>().await?; dir.sort(); assert_eq!(dir, vec!["1".to_string()]); + + let mut dir = + store.list_dir_items("array/c/1/").await?.try_collect::>().await?; + dir.sort(); + assert_eq!(dir, vec![ListDirItem::Prefix("1".to_string()),]); + Ok(()) + } + + #[tokio::test] + async fn test_list_dir_with_prefix() -> Result<(), Box> { + let storage: Arc = + Arc::new(ObjectStorage::new_in_memory_store(Some("prefix".into()))); + let ds = Repository::init(Arc::clone(&storage), false).await?.build(); + let mut store = Store::from_repository( + ds, + AccessMode::ReadWrite, + Some("main".to_string()), + None, + ); + + store + .borrow_mut() + .set( + "zarr.json", + Bytes::copy_from_slice(br#"{"zarr_format":3, "node_type":"group"}"#), + ) + .await?; + + store + .borrow_mut() + .set( + "group/zarr.json", + Bytes::copy_from_slice(br#"{"zarr_format":3, "node_type":"group"}"#), + ) + .await?; + + store + .borrow_mut() + .set( + "group-suffix/zarr.json", + Bytes::copy_from_slice(br#"{"zarr_format":3, "node_type":"group"}"#), + ) + .await?; + + assert_eq!( + store.list_dir("group/").await?.try_collect::>().await?, + vec!["zarr.json"] + ); Ok(()) } @@ -1930,12 +2317,18 @@ mod tests { store.set("array/zarr.json", zarr_meta.clone()).await.unwrap(); let data = Bytes::copy_from_slice(b"hello"); - store.set("array/c/0/1/0", 
data.clone()).await.unwrap(); + store.set_if_not_exists("array/c/0/1/0", data.clone()).await.unwrap(); + assert_eq!(store.get("array/c/0/1/0", &ByteRange::ALL).await.unwrap(), data); let snapshot_id = store.commit("initial commit").await.unwrap(); let new_data = Bytes::copy_from_slice(b"world"); + store.set_if_not_exists("array/c/0/1/0", new_data.clone()).await.unwrap(); + assert_eq!(store.get("array/c/0/1/0", &ByteRange::ALL).await.unwrap(), data); + store.set("array/c/0/1/0", new_data.clone()).await.unwrap(); + assert_eq!(store.get("array/c/0/1/0", &ByteRange::ALL).await.unwrap(), new_data); + let new_snapshot_id = store.commit("update").await.unwrap(); store.checkout(VersionInfo::SnapshotId(snapshot_id.clone())).await.unwrap(); @@ -1954,7 +2347,7 @@ mod tests { store.reset().await?; assert_eq!(store.get("array/c/0/1/0", &ByteRange::ALL).await.unwrap(), new_data); - // TODO: Create a new branch and do stuff with it + // Create a new branch and do stuff with it store.new_branch("dev").await?; store.set("array/c/0/1/0", new_data.clone()).await?; let dev_snapshot_id = store.commit("update dev branch").await?; @@ -1975,6 +2368,180 @@ mod tests { Ok(()) } + #[tokio::test] + async fn test_clear() -> Result<(), Box> { + let storage: Arc = + Arc::new(ObjectStorage::new_in_memory_store(Some("prefix".into()))); + + let mut store = Store::new_from_storage(Arc::clone(&storage)).await?; + + store + .set( + "zarr.json", + Bytes::copy_from_slice(br#"{"zarr_format":3, "node_type":"group"}"#), + ) + .await + .unwrap(); + + let empty: Vec = Vec::new(); + store.clear().await?; + assert_eq!( + store.list_prefix("").await?.try_collect::>().await?, + empty + ); + + store + .set( + "zarr.json", + Bytes::copy_from_slice(br#"{"zarr_format":3, "node_type":"group"}"#), + ) + .await + .unwrap(); + store + .set( + "group/zarr.json", + Bytes::copy_from_slice(br#"{"zarr_format":3, "node_type":"group"}"#), + ) + .await + .unwrap(); + let zarr_meta = Bytes::copy_from_slice(br#"{"zarr_format":3,"node_type":"array","attributes":{"foo":42},"shape":[2,2,2],"data_type":"int32","chunk_grid":{"name":"regular","configuration":{"chunk_shape":[1,1,1]}},"chunk_key_encoding":{"name":"default","configuration":{"separator":"/"}},"fill_value":0,"codecs":[{"name":"mycodec","configuration":{"foo":42}}],"storage_transformers":[{"name":"mytransformer","configuration":{"bar":43}}],"dimension_names":["x","y","t"]}"#); + let new_data = Bytes::copy_from_slice(b"world"); + store.set("array/zarr.json", zarr_meta.clone()).await.unwrap(); + store.set("group/array/zarr.json", zarr_meta.clone()).await.unwrap(); + store.set("array/c/1/0/0", new_data.clone()).await.unwrap(); + store.set("group/array/c/1/0/0", new_data.clone()).await.unwrap(); + + let _ = store.commit("initial commit").await.unwrap(); + + store + .set( + "group/group2/zarr.json", + Bytes::copy_from_slice(br#"{"zarr_format":3, "node_type":"group"}"#), + ) + .await + .unwrap(); + store.set("group/group2/array/zarr.json", zarr_meta.clone()).await.unwrap(); + store.set("group/group2/array/c/1/0/0", new_data.clone()).await.unwrap(); + + store.clear().await?; + + assert_eq!( + store.list_prefix("").await?.try_collect::>().await?, + empty + ); + + let empty_snap = store.commit("no content commit").await.unwrap(); + + assert_eq!( + store.list_prefix("").await?.try_collect::>().await?, + empty + ); + + let store = Store::from_repository( + Repository::update(Arc::clone(&storage), empty_snap).build(), + AccessMode::ReadWrite, + None, + None, + ); + assert_eq!( + 
store.list_prefix("").await?.try_collect::>().await?, + empty + ); + + Ok(()) + } + + #[tokio::test] + async fn test_branch_reset() -> Result<(), Box> { + let storage: Arc = + Arc::new(ObjectStorage::new_in_memory_store(Some("prefix".into()))); + + let mut store = Store::new_from_storage(Arc::clone(&storage)).await?; + + store + .set( + "zarr.json", + Bytes::copy_from_slice(br#"{"zarr_format":3, "node_type":"group"}"#), + ) + .await + .unwrap(); + + store.commit("root group").await.unwrap(); + + store + .set( + "a/zarr.json", + Bytes::copy_from_slice(br#"{"zarr_format":3, "node_type":"group"}"#), + ) + .await + .unwrap(); + + let prev_snap = store.commit("group a").await?; + + store + .set( + "b/zarr.json", + Bytes::copy_from_slice(br#"{"zarr_format":3, "node_type":"group"}"#), + ) + .await + .unwrap(); + + store.commit("group b").await?; + assert!(store.exists("a/zarr.json").await?); + assert!(store.exists("b/zarr.json").await?); + + store.reset_branch(prev_snap).await?; + + assert!(!store.exists("b/zarr.json").await?); + assert!(store.exists("a/zarr.json").await?); + + let (repo, _) = + RepositoryConfig::existing(VersionInfo::BranchTipRef("main".to_string())) + .make_repository(storage) + .await?; + let store = Store::from_repository( + repo, + AccessMode::ReadOnly, + Some("main".to_string()), + None, + ); + assert!(!store.exists("b/zarr.json").await?); + assert!(store.exists("a/zarr.json").await?); + Ok(()) + } + + #[tokio::test] + async fn test_access_mode() { + let storage: Arc = + Arc::new(ObjectStorage::new_in_memory_store(Some("prefix".into()))); + + let writeable_store = + Store::new_from_storage(Arc::clone(&storage)).await.unwrap(); + assert_eq!(writeable_store.access_mode(), &AccessMode::ReadWrite); + + writeable_store + .set( + "zarr.json", + Bytes::copy_from_slice(br#"{"zarr_format":3, "node_type":"group"}"#), + ) + .await + .unwrap(); + + let readable_store = writeable_store.with_access_mode(AccessMode::ReadOnly); + assert_eq!(readable_store.access_mode(), &AccessMode::ReadOnly); + + let result = readable_store + .set( + "zarr.json", + Bytes::copy_from_slice(br#"{"zarr_format":3, "node_type":"group"}"#), + ) + .await; + let correct_error = matches!(result, Err(StoreError::ReadOnly { .. 
})); + assert!(correct_error); + + readable_store.get("zarr.json", &ByteRange::ALL).await.unwrap(); + } + #[test] fn test_store_config_deserialization() -> Result<(), Box> { let expected = ConsolidatedStore { @@ -1985,6 +2552,8 @@ mod tests { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, ]))), unsafe_overwrite_refs: Some(true), + change_set_bytes: None, + virtual_ref_config: None, }, config: Some(StoreOptions { get_partial_values_concurrency: 100 }), }; @@ -2018,6 +2587,8 @@ mod tests { version: None, inline_chunk_threshold_bytes: None, unsafe_overwrite_refs: None, + change_set_bytes: None, + virtual_ref_config: None, }, config: None, ..expected.clone() @@ -2037,6 +2608,8 @@ mod tests { version: None, inline_chunk_threshold_bytes: None, unsafe_overwrite_refs: None, + change_set_bytes: None, + virtual_ref_config: None, }, config: None, ..expected.clone() @@ -2055,6 +2628,8 @@ mod tests { version: None, inline_chunk_threshold_bytes: None, unsafe_overwrite_refs: None, + change_set_bytes: None, + virtual_ref_config: None, }, storage: StorageConfig::InMemory { prefix: Some("prefix".to_string()) }, config: None, @@ -2073,6 +2648,8 @@ mod tests { version: None, inline_chunk_threshold_bytes: None, unsafe_overwrite_refs: None, + change_set_bytes: None, + virtual_ref_config: None, }, storage: StorageConfig::InMemory { prefix: None }, config: None, @@ -2091,12 +2668,13 @@ mod tests { version: None, inline_chunk_threshold_bytes: None, unsafe_overwrite_refs: None, + change_set_bytes: None, + virtual_ref_config: None, }, storage: StorageConfig::S3ObjectStore { bucket: String::from("test"), prefix: String::from("root"), - credentials: None, - endpoint: None + config: None, }, config: None, }, @@ -2109,10 +2687,12 @@ mod tests { "bucket":"test", "prefix":"root", "credentials":{ + "type":"static", "access_key_id":"my-key", "secret_access_key":"my-secret-key" }, - "endpoint":"http://localhost:9000" + "endpoint":"http://localhost:9000", + "allow_http": true }, "repository": {} } @@ -2123,16 +2703,22 @@ mod tests { version: None, inline_chunk_threshold_bytes: None, unsafe_overwrite_refs: None, + change_set_bytes: None, + virtual_ref_config: None, }, storage: StorageConfig::S3ObjectStore { bucket: String::from("test"), prefix: String::from("root"), - credentials: Some(S3Credentials { - access_key_id: String::from("my-key"), - secret_access_key: String::from("my-secret-key"), - session_token: None, - }), - endpoint: Some(String::from("http://localhost:9000")) + config: Some(S3Config { + region: None, + endpoint: Some(String::from("http://localhost:9000")), + credentials: S3Credentials::Static(StaticS3Credentials { + access_key_id: String::from("my-key"), + secret_access_key: String::from("my-secret-key"), + session_token: None, + }), + allow_http: true, + }) }, config: None, }, diff --git a/icechunk/tests/test_concurrency.rs b/icechunk/tests/test_concurrency.rs index 2e07b2cb..a5776be5 100644 --- a/icechunk/tests/test_concurrency.rs +++ b/icechunk/tests/test_concurrency.rs @@ -1,4 +1,4 @@ -#![allow(clippy::expect_used)] +#![allow(clippy::expect_used, clippy::unwrap_used)] use bytes::Bytes; use icechunk::{ format::{ByteRange, ChunkIndices, Path}, @@ -33,7 +33,7 @@ async fn test_concurrency() -> Result<(), Box> { Arc::new(ObjectStorage::new_in_memory_store(Some("prefix".into()))); let mut ds = Repository::init(Arc::clone(&storage), false).await?.build(); - ds.add_group("/".into()).await?; + ds.add_group(Path::root()).await?; let zarr_meta = ZarrArrayMetadata { shape: vec![N as u64, N as u64], data_type: 
DataType::Float64, @@ -51,7 +51,7 @@ async fn test_concurrency() -> Result<(), Box> { dimension_names: Some(vec![Some("x".to_string()), Some("y".to_string())]), }; - let new_array_path: Path = "/array".to_string().into(); + let new_array_path: Path = "/array".try_into().unwrap(); ds.add_array(new_array_path.clone(), zarr_meta.clone()).await?; let ds = Arc::new(RwLock::new(ds)); @@ -101,7 +101,11 @@ async fn write_task(ds: Arc>, x: u32, y: u32) { ds.write() .await - .set_chunk_ref("/array".into(), ChunkIndices(vec![x, y]), Some(payload)) + .set_chunk_ref( + "/array".try_into().unwrap(), + ChunkIndices(vec![x, y]), + Some(payload), + ) .await .expect("Failed to write chunk ref"); } @@ -114,7 +118,7 @@ async fn read_task(ds: Arc>, x: u32, y: u32, barrier: Arc>, x: u32, y: u32, barrier: Arc { if bytes == &expected_bytes { @@ -159,7 +162,7 @@ async fn list_task(ds: Arc>, barrier: Arc) { .list_nodes() .await .expect("list_nodes failed") - .map(|n| n.path.as_path().to_string_lossy().into_owned()) + .map(|n| n.path.to_string()) .collect::>(); assert_eq!(expected_nodes, nodes); diff --git a/icechunk/tests/test_distributed_writes.rs b/icechunk/tests/test_distributed_writes.rs new file mode 100644 index 00000000..49cafb2c --- /dev/null +++ b/icechunk/tests/test_distributed_writes.rs @@ -0,0 +1,195 @@ +#![allow(clippy::unwrap_used)] +use pretty_assertions::assert_eq; +use std::{num::NonZeroU64, ops::Range, sync::Arc}; + +use bytes::Bytes; +use icechunk::{ + format::{ByteRange, ChunkIndices, Path, SnapshotId}, + metadata::{ChunkKeyEncoding, ChunkShape, DataType, FillValue}, + repository::{get_chunk, ChangeSet, ZarrArrayMetadata}, + storage::s3::{S3Config, S3Credentials, S3Storage, StaticS3Credentials}, + Repository, Storage, +}; +use tokio::task::JoinSet; + +const SIZE: usize = 10; + +async fn mk_storage( + prefix: &str, +) -> Result, Box> { + let storage: Arc = Arc::new( + S3Storage::new_s3_store( + "testbucket", + prefix, + Some(&S3Config { + region: Some("us-east-1".to_string()), + endpoint: Some("http://localhost:9000".to_string()), + credentials: S3Credentials::Static(StaticS3Credentials { + access_key_id: "minio123".into(), + secret_access_key: "minio123".into(), + session_token: None, + }), + allow_http: true, + }), + ) + .await?, + ); + Ok(Repository::add_in_mem_asset_caching(storage)) +} + +async fn mk_repo( + storage: Arc, + init: bool, +) -> Result> { + if init { + Ok(Repository::init(storage, false).await?.with_inline_threshold_bytes(1).build()) + } else { + Ok(Repository::from_branch_tip(storage, "main") + .await? 
+ .with_inline_threshold_bytes(1) + .build()) + } +} + +async fn write_chunks( + mut repo: Repository, + xs: Range, + ys: Range, +) -> Result> { + for x in xs { + for y in ys.clone() { + let fx = x as f64; + let fy = y as f64; + let bytes: Vec = fx + .to_le_bytes() + .into_iter() + .chain(fy.to_le_bytes().into_iter()) + .collect(); + let payload = + repo.get_chunk_writer()(Bytes::copy_from_slice(bytes.as_slice())).await?; + repo.set_chunk_ref( + "/array".try_into().unwrap(), + ChunkIndices(vec![x, y]), + Some(payload), + ) + .await?; + } + } + Ok(repo) +} + +async fn verify( + repo: Repository, +) -> Result<(), Box> { + for x in 0..(SIZE / 2) as u32 { + for y in 0..(SIZE / 2) as u32 { + let bytes = get_chunk( + repo.get_chunk_reader( + &"/array".try_into().unwrap(), + &ChunkIndices(vec![x, y]), + &ByteRange::ALL, + ) + .await?, + ) + .await?; + assert!(bytes.is_some()); + let bytes = bytes.unwrap(); + let written_x = f64::from_le_bytes(bytes[0..8].try_into().unwrap()); + let written_y = f64::from_le_bytes(bytes[8..16].try_into().unwrap()); + assert_eq!(x as f64, written_x); + assert_eq!(y as f64, written_y); + } + } + Ok(()) +} + +#[tokio::test] +#[allow(clippy::unwrap_used)] +/// This test does a distributed write from 4 different [`Repository`] instances, and then commits. +/// +/// - We create a repo, and write an empty array to it. +/// - We commit to it +/// - We initialize 3 other Repos pointing to the same place +/// - We do concurrent writes from the 4 repo instances +/// - When done, we do a distributed commit using a random repo +/// - The changes from the other repos are serialized via [`ChangeSet::export_to_bytes`] +async fn test_distributed_writes() -> Result<(), Box> +{ + let prefix = format!("test_distributed_writes_{}", SnapshotId::random()); + let storage1 = mk_storage(prefix.as_str()).await?; + let storage2 = mk_storage(prefix.as_str()).await?; + let storage3 = mk_storage(prefix.as_str()).await?; + let storage4 = mk_storage(prefix.as_str()).await?; + let mut repo1 = mk_repo(storage1, true).await?; + + let zarr_meta = ZarrArrayMetadata { + shape: vec![SIZE as u64, SIZE as u64], + data_type: DataType::Float64, + chunk_shape: ChunkShape(vec![ + NonZeroU64::new(2).unwrap(), + NonZeroU64::new(2).unwrap(), + ]), + chunk_key_encoding: ChunkKeyEncoding::Slash, + fill_value: FillValue::Float64(f64::NAN), + codecs: vec![], + storage_transformers: None, + dimension_names: None, + }; + + let new_array_path: Path = "/array".try_into().unwrap(); + repo1.add_array(new_array_path.clone(), zarr_meta.clone()).await?; + repo1.commit("main", "create array", None).await?; + + let repo2 = mk_repo(storage2, false).await?; + let repo3 = mk_repo(storage3, false).await?; + let repo4 = mk_repo(storage4, false).await?; + + let mut set = JoinSet::new(); + #[allow(clippy::erasing_op, clippy::identity_op)] + { + let size2 = SIZE as u32; + let size24 = size2 / 4; + let xrange1 = size24 * 0..size24 * 1; + let xrange2 = size24 * 1..size24 * 2; + let xrange3 = size24 * 2..size24 * 3; + let xrange4 = size24 * 3..size24 * 4; + set.spawn(async move { write_chunks(repo1, xrange1, 0..size2).await }); + set.spawn(async move { write_chunks(repo2, xrange2, 0..size2).await }); + set.spawn(async move { write_chunks(repo3, xrange3, 0..size2).await }); + set.spawn(async move { write_chunks(repo4, xrange4, 0..size2).await }); + } + + let mut write_results = set.join_all().await; + + // We have completed all the chunk writes + assert!(write_results.len() == 4); + assert!(write_results.iter().all(|r| r.is_ok())); + + // 
We recover our repo instances (the may be numbered in a different order, doesn't matter) + let mut repo1 = write_results.pop().unwrap().unwrap(); + let repo2 = write_results.pop().unwrap().unwrap(); + let repo3 = write_results.pop().unwrap().unwrap(); + let repo4 = write_results.pop().unwrap().unwrap(); + + // We get the ChangeSet from repos 2, 3 and 4, by converting them into bytes. + // This simulates a marshalling operation from a remote writer. + let change_sets: Vec = vec![repo2.into(), repo3.into(), repo4.into()]; + let change_sets_bytes = change_sets.iter().map(|cs| cs.export_to_bytes().unwrap()); + let change_sets = change_sets_bytes + .map(|bytes| ChangeSet::import_from_bytes(bytes.as_slice()).unwrap()); + + // Distributed commit now, using arbitrarily one of the repos as base and the others as extra + // changesets + let _new_snapshot = + repo1.distributed_commit("main", change_sets, "distributed commit", None).await?; + + // We check we can read all chunks correctly + verify(repo1).await?; + + // To be safe, we create a new instance of the storage and repo, and verify again + let storage = mk_storage(prefix.as_str()).await?; + let repo = mk_repo(storage, false).await?; + verify(repo).await?; + + Ok(()) +} diff --git a/icechunk/tests/test_s3_storage.rs b/icechunk/tests/test_s3_storage.rs new file mode 100644 index 00000000..0d35291d --- /dev/null +++ b/icechunk/tests/test_s3_storage.rs @@ -0,0 +1,190 @@ +use std::{collections::HashSet, sync::Arc}; + +use bytes::Bytes; +use chrono::Utc; +use icechunk::{ + format::{ + manifest::Manifest, snapshot::Snapshot, ByteRange, ChunkId, ManifestId, + SnapshotId, + }, + refs::{ + create_tag, fetch_branch_tip, fetch_tag, list_refs, update_branch, Ref, RefError, + }, + storage::{ + s3::{S3Config, S3Credentials, S3Storage, StaticS3Credentials}, + StorageResult, + }, + Storage, +}; +use pretty_assertions::assert_eq; + +async fn mk_storage() -> StorageResult { + S3Storage::new_s3_store( + "testbucket", + "test_s3_storage__".to_string() + Utc::now().to_rfc3339().as_str(), + Some(&S3Config { + region: Some("us-east-1".to_string()), + endpoint: Some("http://localhost:9000".to_string()), + credentials: S3Credentials::Static(StaticS3Credentials { + access_key_id: "minio123".into(), + secret_access_key: "minio123".into(), + session_token: None, + }), + allow_http: true, + }), + ) + .await +} + +#[tokio::test] +pub async fn test_snapshot_write_read() -> Result<(), Box> { + let storage = mk_storage().await?; + let id = SnapshotId::random(); + let snapshot = Arc::new(Snapshot::empty()); + storage.write_snapshot(id.clone(), snapshot.clone()).await?; + let back = storage.fetch_snapshot(&id).await?; + assert_eq!(snapshot, back); + Ok(()) +} + +#[tokio::test] +pub async fn test_manifest_write_read() -> Result<(), Box> { + let storage = mk_storage().await?; + let id = ManifestId::random(); + let manifest = Arc::new(Manifest::default()); + storage.write_manifests(id.clone(), manifest.clone()).await?; + let back = storage.fetch_manifests(&id).await?; + assert_eq!(manifest, back); + Ok(()) +} + +#[tokio::test] +pub async fn test_chunk_write_read() -> Result<(), Box> { + let storage = mk_storage().await?; + let id = ChunkId::random(); + let bytes = Bytes::from_static(b"hello"); + storage.write_chunk(id.clone(), bytes.clone()).await?; + let back = storage.fetch_chunk(&id, &ByteRange::ALL).await?; + assert_eq!(bytes, back); + + let back = + storage.fetch_chunk(&id, &ByteRange::from_offset_with_length(1, 2)).await?; + assert_eq!(Bytes::from_static(b"el"), back); + + 
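// Editor's note (not part of the patch): the assertions in this test pin down the ByteRange
// helpers against the 5-byte payload b"hello": `ALL` returns the whole chunk,
// `from_offset_with_length(1, 2)` the slice "el", `from_offset(1)` everything after the first
// byte ("ello"), `to_offset(3)` the first three bytes ("hel"), and `bounded(1, 4)` the
// half-open slice "ell".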
let back = storage.fetch_chunk(&id, &ByteRange::from_offset(1)).await?; + assert_eq!(Bytes::from_static(b"ello"), back); + + let back = storage.fetch_chunk(&id, &ByteRange::to_offset(3)).await?; + assert_eq!(Bytes::from_static(b"hel"), back); // codespell:ignore + + let back = storage.fetch_chunk(&id, &ByteRange::bounded(1, 4)).await?; + assert_eq!(Bytes::from_static(b"ell"), back); + Ok(()) +} + +#[tokio::test] +pub async fn test_tag_write_get() -> Result<(), Box> { + let storage = mk_storage().await?; + let id = SnapshotId::random(); + create_tag(&storage, "mytag", id.clone(), false).await?; + let back = fetch_tag(&storage, "mytag").await?; + assert_eq!(id, back.snapshot); + Ok(()) +} + +#[tokio::test] +pub async fn test_fetch_non_existing_tag() -> Result<(), Box> { + let storage = mk_storage().await?; + let id = SnapshotId::random(); + create_tag(&storage, "mytag", id.clone(), false).await?; + + let back = fetch_tag(&storage, "non-existing-tag").await; + assert!(matches!(back, Err(RefError::RefNotFound(r)) if r == "non-existing-tag")); + Ok(()) +} + +#[tokio::test] +pub async fn test_create_existing_tag() -> Result<(), Box> { + let storage = mk_storage().await?; + let id = SnapshotId::random(); + create_tag(&storage, "mytag", id.clone(), false).await?; + + let res = create_tag(&storage, "mytag", id.clone(), false).await; + assert!(matches!(res, Err(RefError::TagAlreadyExists(r)) if r == "mytag")); + Ok(()) +} + +#[tokio::test] +pub async fn test_branch_initialization() -> Result<(), Box> { + let storage = mk_storage().await?; + let id = SnapshotId::random(); + + let res = update_branch(&storage, "some-branch", id.clone(), None, false).await?; + assert_eq!(res.0, 0); + + let res = fetch_branch_tip(&storage, "some-branch").await?; + assert_eq!(res.snapshot, id); + + Ok(()) +} + +#[tokio::test] +pub async fn test_fetch_non_existing_branch() -> Result<(), Box> { + let storage = mk_storage().await?; + let id = SnapshotId::random(); + update_branch(&storage, "some-branch", id.clone(), None, false).await?; + + let back = fetch_branch_tip(&storage, "non-existing-branch").await; + assert!(matches!(back, Err(RefError::RefNotFound(r)) if r == "non-existing-branch")); + Ok(()) +} + +#[tokio::test] +pub async fn test_branch_update() -> Result<(), Box> { + let storage = mk_storage().await?; + let id1 = SnapshotId::random(); + let id2 = SnapshotId::random(); + let id3 = SnapshotId::random(); + + let res = update_branch(&storage, "some-branch", id1.clone(), None, false).await?; + assert_eq!(res.0, 0); + + let res = + update_branch(&storage, "some-branch", id2.clone(), Some(&id1), false).await?; + assert_eq!(res.0, 1); + + let res = + update_branch(&storage, "some-branch", id3.clone(), Some(&id2), false).await?; + assert_eq!(res.0, 2); + + let res = fetch_branch_tip(&storage, "some-branch").await?; + assert_eq!(res.snapshot, id3); + + Ok(()) +} + +#[tokio::test] +pub async fn test_ref_names() -> Result<(), Box> { + let storage = mk_storage().await?; + let id1 = SnapshotId::random(); + let id2 = SnapshotId::random(); + update_branch(&storage, "main", id1.clone(), None, false).await?; + update_branch(&storage, "main", id2.clone(), Some(&id1), false).await?; + update_branch(&storage, "foo", id1.clone(), None, false).await?; + update_branch(&storage, "bar", id1.clone(), None, false).await?; + create_tag(&storage, "my-tag", id1.clone(), false).await?; + create_tag(&storage, "my-other-tag", id1.clone(), false).await?; + + let res: HashSet<_> = HashSet::from_iter(list_refs(&storage).await?); + assert_eq!( + 
res, + HashSet::from_iter([ + Ref::Tag("my-tag".to_string()), + Ref::Tag("my-other-tag".to_string()), + Ref::Branch("main".to_string()), + Ref::Branch("foo".to_string()), + Ref::Branch("bar".to_string()), + ]) + ); + Ok(()) +} diff --git a/icechunk/tests/test_virtual_refs.rs b/icechunk/tests/test_virtual_refs.rs index 6d18d0e2..13bbb418 100644 --- a/icechunk/tests/test_virtual_refs.rs +++ b/icechunk/tests/test_virtual_refs.rs @@ -4,56 +4,70 @@ mod tests { use icechunk::{ format::{ manifest::{VirtualChunkLocation, VirtualChunkRef}, - ByteRange, ChunkId, ChunkIndices, + ByteRange, ChunkId, ChunkIndices, Path, }, metadata::{ChunkKeyEncoding, ChunkShape, DataType, FillValue}, repository::{get_chunk, ChunkPayload, ZarrArrayMetadata}, - storage::{object_store::S3Credentials, ObjectStorage}, + storage::{ + s3::{mk_client, S3Config, S3Credentials, S3Storage, StaticS3Credentials}, + virtual_ref::ObjectStoreVirtualChunkResolverConfig, + ObjectStorage, + }, zarr::AccessMode, Repository, Storage, Store, }; - use std::sync::Arc; - use std::{error::Error, num::NonZeroU64, path::PathBuf}; + use std::{error::Error, num::NonZeroU64}; + use std::{path::Path as StdPath, sync::Arc}; + use tempfile::TempDir; use bytes::Bytes; - use object_store::{ObjectStore, PutMode, PutOptions, PutPayload}; + use object_store::{ + local::LocalFileSystem, ObjectStore, PutMode, PutOptions, PutPayload, + }; use pretty_assertions::assert_eq; - async fn create_minio_repository() -> Repository { - let storage: Arc = Arc::new( - ObjectStorage::new_s3_store( - "testbucket".to_string(), - format!("{:?}", ChunkId::random()), - Some(S3Credentials { - access_key_id: "minio123".into(), - secret_access_key: "minio123".into(), - session_token: None, - }), - Some("http://localhost:9000"), - ) - .expect("Creating minio storage failed"), - ); - Repository::init(Arc::clone(&storage), true) + fn minino_s3_config() -> S3Config { + S3Config { + region: Some("us-east-1".to_string()), + endpoint: Some("http://localhost:9000".to_string()), + credentials: S3Credentials::Static(StaticS3Credentials { + access_key_id: "minio123".into(), + secret_access_key: "minio123".into(), + session_token: None, + }), + allow_http: true, + } + } + + fn anon_s3_config() -> S3Config { + S3Config { + region: Some("us-east-1".to_string()), + endpoint: None, + credentials: S3Credentials::Anonymous, + allow_http: false, + } + } + + async fn create_repository( + storage: Arc, + virtual_s3_config: S3Config, + ) -> Repository { + Repository::init(storage, true) .await .expect("building repository failed") + .with_virtual_ref_config(ObjectStoreVirtualChunkResolverConfig::S3( + virtual_s3_config, + )) .build() } - async fn write_chunks_to_minio(chunks: impl Iterator) { - use object_store::aws::AmazonS3Builder; - let bucket_name = "testbucket".to_string(); + async fn write_chunks_to_store( + store: impl ObjectStore, + chunks: impl Iterator, + ) { // TODO: Switch to PutMode::Create when object_store supports that let opts = PutOptions { mode: PutMode::Overwrite, ..PutOptions::default() }; - let store = AmazonS3Builder::new() - .with_access_key_id("minio123") - .with_secret_access_key("minio123") - .with_endpoint("http://localhost:9000") - .with_allow_http(true) - .with_bucket_name(bucket_name) - .build() - .expect("building S3 store failed"); - for (path, bytes) in chunks { store .put_opts( @@ -65,9 +79,167 @@ mod tests { .expect(&format!("putting chunk to {} failed", &path)); } } + async fn create_local_repository( + path: &StdPath, + virtual_s3_config: S3Config, + ) -> 
Repository { + let storage: Arc = Arc::new( + ObjectStorage::new_local_store(path).expect("Creating local storage failed"), + ); + + create_repository(storage, virtual_s3_config).await + } + + async fn create_minio_repository() -> Repository { + let storage: Arc = Arc::new( + S3Storage::new_s3_store( + "testbucket".to_string(), + format!("{:?}", ChunkId::random()), + Some(&minino_s3_config()), + ) + .await + .expect("Creating minio storage failed"), + ); + + create_repository(storage, minino_s3_config()).await + } + + async fn write_chunks_to_local_fs(chunks: impl Iterator) { + let store = + LocalFileSystem::new_with_prefix("/").expect("Failed to create local store"); + write_chunks_to_store(store, chunks).await; + } + + async fn write_chunks_to_minio(chunks: impl Iterator) { + let client = mk_client(Some(&minino_s3_config())).await; + + let bucket_name = "testbucket".to_string(); + for (key, bytes) in chunks { + client + .put_object() + .bucket(bucket_name.clone()) + .key(key) + .body(bytes.into()) + .send() + .await + .unwrap(); + } + } + + #[tokio::test(flavor = "multi_thread")] + async fn test_repository_with_local_virtual_refs() -> Result<(), Box> { + let chunk_dir = TempDir::new()?; + let chunk_1 = chunk_dir.path().join("chunk-1").to_str().unwrap().to_owned(); + let chunk_2 = chunk_dir.path().join("chunk-2").to_str().unwrap().to_owned(); + + let bytes1 = Bytes::copy_from_slice(b"first"); + let bytes2 = Bytes::copy_from_slice(b"second0000"); + let chunks = [(chunk_1, bytes1.clone()), (chunk_2, bytes2.clone())]; + write_chunks_to_local_fs(chunks.iter().cloned()).await; + + let repo_dir = TempDir::new()?; + let mut ds = create_local_repository(repo_dir.path(), anon_s3_config()).await; + + let zarr_meta = ZarrArrayMetadata { + shape: vec![1, 1, 2], + data_type: DataType::Int32, + chunk_shape: ChunkShape(vec![NonZeroU64::new(2).unwrap()]), + chunk_key_encoding: ChunkKeyEncoding::Slash, + fill_value: FillValue::Int32(0), + codecs: vec![], + storage_transformers: None, + dimension_names: None, + }; + let payload1 = ChunkPayload::Virtual(VirtualChunkRef { + location: VirtualChunkLocation::from_absolute_path(&format!( + // intentional extra '/' + "file://{}", + chunks[0].0 + ))?, + offset: 0, + length: 5, + }); + let payload2 = ChunkPayload::Virtual(VirtualChunkRef { + location: VirtualChunkLocation::from_absolute_path(&format!( + "file://{}", + chunks[1].0, + ))?, + offset: 1, + length: 5, + }); + + let new_array_path: Path = "/array".try_into().unwrap(); + ds.add_array(new_array_path.clone(), zarr_meta.clone()).await.unwrap(); + + ds.set_chunk_ref( + new_array_path.clone(), + ChunkIndices(vec![0, 0, 0]), + Some(payload1), + ) + .await + .unwrap(); + ds.set_chunk_ref( + new_array_path.clone(), + ChunkIndices(vec![0, 0, 1]), + Some(payload2), + ) + .await + .unwrap(); + + assert_eq!( + get_chunk( + ds.get_chunk_reader( + &new_array_path, + &ChunkIndices(vec![0, 0, 0]), + &ByteRange::ALL + ) + .await + .unwrap() + ) + .await + .unwrap(), + Some(bytes1.clone()), + ); + assert_eq!( + get_chunk( + ds.get_chunk_reader( + &new_array_path, + &ChunkIndices(vec![0, 0, 1]), + &ByteRange::ALL + ) + .await + .unwrap() + ) + .await + .unwrap(), + Some(Bytes::copy_from_slice(&bytes2[1..6])), + ); + + for range in [ + ByteRange::bounded(0u64, 3u64), + ByteRange::from_offset(2u64), + ByteRange::to_offset(4u64), + ] { + assert_eq!( + get_chunk( + ds.get_chunk_reader( + &new_array_path, + &ChunkIndices(vec![0, 0, 0]), + &range + ) + .await + .unwrap() + ) + .await + .unwrap(), + 
Some(range.slice(bytes1.clone())) + ); + } + Ok(()) + } #[tokio::test(flavor = "multi_thread")] - async fn test_repository_with_virtual_refs() -> Result<(), Box> { + async fn test_repository_with_minio_virtual_refs() -> Result<(), Box> { let bytes1 = Bytes::copy_from_slice(b"first"); let bytes2 = Bytes::copy_from_slice(b"second0000"); let chunks = [ @@ -106,7 +278,7 @@ mod tests { length: 5, }); - let new_array_path: PathBuf = "/array".to_string().into(); + let new_array_path: Path = "/array".try_into().unwrap(); ds.add_array(new_array_path.clone(), zarr_meta.clone()).await.unwrap(); ds.set_chunk_ref( @@ -177,7 +349,7 @@ mod tests { } #[tokio::test] - async fn test_zarr_store_virtual_refs_set_and_get( + async fn test_zarr_store_virtual_refs_minio_set_and_get( ) -> Result<(), Box> { let bytes1 = Bytes::copy_from_slice(b"first"); let bytes2 = Bytes::copy_from_slice(b"second0000"); @@ -235,4 +407,50 @@ mod tests { ); Ok(()) } + + #[tokio::test] + async fn test_zarr_store_virtual_refs_from_public_s3( + ) -> Result<(), Box> { + let repo_dir = TempDir::new()?; + let ds = create_local_repository(repo_dir.path(), anon_s3_config()).await; + + let mut store = Store::from_repository( + ds, + AccessMode::ReadWrite, + Some("main".to_string()), + None, + ); + + store + .set( + "zarr.json", + Bytes::copy_from_slice(br#"{"zarr_format":3, "node_type":"group"}"#), + ) + .await + .unwrap(); + + let zarr_meta = Bytes::copy_from_slice(br#"{"zarr_format":3,"node_type":"array","attributes":{"foo":42},"shape":[10],"data_type":"float64","chunk_grid":{"name":"regular","configuration":{"chunk_shape":[10]}},"chunk_key_encoding":{"name":"default","configuration":{"separator":"/"}},"fill_value": 0.0,"codecs":[{"name":"mycodec","configuration":{"foo":42}}],"storage_transformers":[],"dimension_names":["depth"]}"#); + store.set("depth/zarr.json", zarr_meta.clone()).await.unwrap(); + + let ref2 = VirtualChunkRef { + location: VirtualChunkLocation::from_absolute_path( + "s3://noaa-nos-ofs-pds/dbofs/netcdf/202410/dbofs.t00z.20241009.fields.f030.nc", + )?, + offset: 119339, + length: 80, + }; + + store.set_virtual_ref("depth/c/0", ref2).await?; + + let chunk = store.get("depth/c/0", &ByteRange::ALL).await.unwrap(); + assert_eq!(chunk.len(), 80); + + let second_depth = f64::from_le_bytes(chunk[8..16].try_into().unwrap()); + assert!(second_depth - -0.85 < 0.000001); + + let last_depth = f64::from_le_bytes(chunk[(80 - 8)..].try_into().unwrap()); + assert!(last_depth - -0.05 < 0.000001); + + Ok(()) + } } diff --git a/icechunk_logo.png b/icechunk_logo.png deleted file mode 100644 index 349d2e46..00000000 Binary files a/icechunk_logo.png and /dev/null differ diff --git a/shell.nix b/shell.nix index 40ae0d4c..65e895a8 100644 --- a/shell.nix +++ b/shell.nix @@ -1,6 +1,6 @@ let - # Pinned nixpkgs, deterministic. Last updated to nixpkgs-unstable as of: 2024-07-23 - pkgs = import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/68c9ed8bbed9dfce253cc91560bf9043297ef2fe.tar.gz") {}; + # Pinned nixpkgs, deterministic. Last updated to nixos-unstable as of: 2024-10-06 + pkgs = import (fetchTarball "https://github.com/NixOS/nixpkgs/archive/7d49afd36b5590f023ec56809c02e05d8164fbc4.tar.gz") {}; # Rolling updates, not deterministic. 
# pkgs = import (fetchTarball("channel:nixpkgs-unstable")) {}; diff --git a/spec/icechunk_spec.md b/spec/icechunk_spec.md deleted file mode 100644 index e6b23856..00000000 --- a/spec/icechunk_spec.md +++ /dev/null @@ -1,1093 +0,0 @@ -# Icechunk Specification - -The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to be interpreted as described in [RFC 2119](https://www.rfc-editor.org/rfc/rfc2119.html). - -## Introduction - -The Icechunk specification is a storage specification for [Zarr](https://zarr-specs.readthedocs.io/en/latest/specs.html) data. -Icechunk is inspired by Apache Iceberg and borrows many concepts and ideas from the [Iceberg Spec](https://iceberg.apache.org/spec/#version-2-row-level-deletes). - -This specification describes a single Icechunk **repository**. -A repository is defined as a Zarr store containing one or more Arrays and Groups. -The most common scenario is for a repository to contain a single Zarr group with multiple arrays, each corresponding to different physical variables but sharing common spatiotemporal coordinates. -However, formally a repository can be any valid Zarr hierarchy, from a single Array to a deeply nested structure of Groups and Arrays. -Users of Icechunk should aim to scope their repository only to related arrays and groups that require consistent transactional updates. - -Icechunk defines a series of interconnected metadata and data files that together comprise the format. -All the data and metadata for a repository are stored in a directory in object storage or file storage. - -## Goals - -The goals of the specification are as follows: - -1. **Object storage** - the format is designed around the consistency features and performance characteristics available in modern cloud object storage. No external database or catalog is required. -1. **Serializable isolation** - Reads will be isolated from concurrent writes and always use a committed snapshot of a repository. Writes to repositories will be committed atomically and will not be partially visible. Readers will not acquire locks. -1. **Time travel** - Previous snapshots of a repository remain accessible after new ones have been written. -1. **Chunk sharding and references** - Chunk storage is decoupled from specific file names. Multiple chunks can be packed into a single object (sharding). Zarr-compatible chunks within other file formats (e.g. HDF5, NetCDF) can be referenced. -1. **Schema Evolution** - Arrays and Groups can be added, renamed, and removed from the hierarchy with minimal overhead. - -### Non Goals - -1. **Low Latency** - Icechunk is designed to support analytical workloads for large repositories. We accept that the extra layers of metadata files and indirection will introduce additional cold-start latency compared to regular Zarr. -1. **No Catalog** - The spec does not extend beyond a single repository or provide a way to organize multiple repositories into a hierarchy. -1. **Access Controls** - Access control is the responsibility of the storage medium. -The spec is not designed to enable fine-grained access restrictions (e.g. only read specific arrays) within a single repository. - -### Storage Operations - -Icechunk requires that the storage system support the following operations: - -- **In-place write** - Strong read-after-write and list-after-write consistency is expected. Files are not moved or altered once they are written. 
-- **Conditional write if-not-exists** - For the commit process to be safe and consistent, the storage system must guard against two files of the same name being created at the same time. -- **Seekable reads** - Chunk file formats may require seek support (e.g. shards). -- **Deletes** - Delete files that are no longer used (via a garbage-collection operation). -- **Sorted List** - The storage system must allow the listing of directories / prefixes in a consistent sorted order. - -These requirements are compatible with object stores, like S3, as well as with filesystems. - -The storage system is not required to support random-access writes. Once written, chunk and metadata files are immutable until they are deleted. - -## Specification - -### Overview - -Icechunk uses a series of linked metadata files to describe the state of the repository. - -- The **Snapshot file** records all of the different arrays and groups in the repository, plus their metadata. Every new commit creates a new snapshot file. The snapshot file contains pointers to one or more chunk manifest files and [optionally] attribute files. -- **Chunk manifests** store references to individual chunks. A single manifest may store references for multiple arrays or a subset of all the references for a single array. -- **Attributes files** provide a way to store additional user-defined attributes for arrays and groups outside of the structure file. This is important when the attributes are very large. -- **Chunk files** store the actual compressed chunk data, potentially containing data for multiple chunks in a single file. -- **Reference files** track the state of branches and tags, containing a lightweight pointer to a snapshot file. Transactions on a branch are committed by creating the next branch file in a sequence. - -When reading from store, the client opens the latest branch or tag file to obtain a pointer to the relevant snapshot file. -The client then reads the snapshot file to determine the structure and hierarchy of the repository. -When fetching data from an array, the client first examines the chunk manifest file[s] for that array and finally fetches the chunks referenced therein. - -When writing a new repository snapshot, the client first writes a new set of chunks and chunk manifests, and then generates a new snapshot file. -Finally, in an atomic put-if-not-exists operation, to commit the transaction, it creates the next branch file in the sequence. -This operation may fail if a different client has already committed the next snapshot. -In this case, the client may attempt to resolve the conflicts and retry the commit. 
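
The read-and-commit flow above can be sketched in a few lines of Rust. Everything in the sketch below is hypothetical: an in-memory stand-in replaces the object store, and the branch-file naming is simplified (the real naming and sequence encoding are described under Reference Files below). It only illustrates the "write everything, then atomically create the next branch file, retrying on conflict" control flow.

```rust
// A minimal, self-contained sketch of the optimistic commit loop described
// above. All names are hypothetical and the branch-file naming is simplified;
// only the control flow is meant to match the spec text.
use std::collections::HashSet;
use std::sync::Mutex;

/// In-memory stand-in for an object store that offers "create if not exists".
struct FakeStore {
    existing: Mutex<HashSet<String>>,
}

impl FakeStore {
    /// Returns true only for the first writer of a given key.
    fn create_if_not_exists(&self, key: &str) -> bool {
        self.existing.lock().unwrap().insert(key.to_string())
    }
}

/// Attempt to commit as sequence `N + 1`; if another session already created
/// that branch file, re-resolve against the new tip and try `N + 2`, etc.
fn commit_with_retry(store: &FakeStore, branch: &str, mut sequence: u64) -> u64 {
    loop {
        let next = sequence + 1;
        let key = format!("r/{branch}/{next:08}.json"); // naming simplified
        if store.create_if_not_exists(&key) {
            return next; // the commit succeeded atomically
        }
        // Another session won the race: a real client would re-read its
        // changes and resolve conflicts here before retrying.
        sequence = next;
    }
}

fn main() {
    let store = FakeStore { existing: Mutex::new(HashSet::new()) };
    // Simulate a concurrent writer that already committed sequence 1.
    store.create_if_not_exists("r/main/00000001.json");
    assert_eq!(commit_with_retry(&store, "main", 0), 2);
}
```
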
- - -```mermaid -flowchart TD - subgraph metadata[Metadata] - subgraph reference_files[Reference Files] - old_branch[Main Branch File 001] - branch[Main Branch File 002] - end - subgraph snapshots[Snapshots] - snapshot1[Snapshot File 1] - snapshot2[Snapshot File 2] - end - subgraph attributes[Attributes] - attrs[Attribute File] - end - subgraph manifests[Manifests] - manifestA[Chunk Manifest A] - manifestB[Chunk Manifest B] - end - end - subgraph data - chunk1[Chunk File 1] - chunk2[Chunk File 2] - chunk3[Chunk File 3] - chunk4[Chunk File 4] - end - - branch -- snapshot ID --> snapshot2 - snapshot1 --> attrs - snapshot1 --> manifestA - snapshot2 --> attrs - snapshot2 -->manifestA - snapshot2 -->manifestB - manifestA --> chunk1 - manifestA --> chunk2 - manifestB --> chunk3 - manifestB --> chunk4 - -``` - -### File Layout - -All data and metadata files are stored within a root directory (typically a prefix within an object store) using the following directory structure. - -- `$ROOT` base URI (s3, gcs, local directory, etc.) -- `$ROOT/r/` reference files -- `$ROOT/s/` snapshot files -- `$ROOT/a/` attribute files -- `$ROOT/m/` chunk manifests -- `$ROOT/c/` chunks - -### File Formats - -> [!WARNING] -> The actual file formats used for each type of metadata file are in flux. The spec currently describes the data structures encoded in these files, rather than a specific file format. - - -### Reference Files - -Similar to Git, Icechunk supports the concept of _branches_ and _tags_. -These references point to a specific snapshot of the repository. - -- **Branches** are _mutable_ references to a snapshot. - Repositories may have one or more branches. - The default branch name is `main`. - Repositories must have a `main` branch. - After creation, branches may be updated to point to a different snapshot. -- **Tags** are _immutable_ references to a snapshot. - A repository may contain zero or more tags. - After creation, tags may never be updated, unlike in Git. - -References are very important in the Icechunk design. -Creating or updating references is the point at which consistency and atomicity of Icechunk transactions is enforced. -Different client sessions may simultaneously create two inconsistent snapshots; however, only one session may successfully update a reference to that snapshot. - -References (both branches and tags) are stored as JSON files with the following schema - -```json -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "RefData", - "type": "object", - "required": [ - "properties", - "snapshot", - "timestamp" - ], - "properties": { - "properties": { - "type": "object", - "additionalProperties": true - }, - "snapshot": { - "$ref": "#/definitions/ObjectId" - }, - "timestamp": { - "type": "string", - "format": "date-time" - } - }, - "definitions": { - "ObjectId": { - "description": "The id of a file in object store", - "type": "array", - "items": { - "type": "integer", - "format": "uint8", - "minimum": 0.0 - }, - "maxItems": 16, - "minItems": 16 - } - } -} -``` - -#### Creating and Updating Branches - -The process of creating and updating branches is designed to use the limited consistency guarantees offered by object storage to ensure transactional consistency. -When a client checks out a branch, it obtains a specific snapshot ID and uses this snapshot as the basis for any changes it creates during its session. -The client creates a new snapshot and then updates the branch reference to point to the new snapshot (a "commit"). 
-However, when updating the branch reference, the client must detect whether a _different session_ has updated the branch reference in the interim, possibly retrying or failing the commit if so. -This is an "optimistic concurrency" strategy; the resolution mechanism can be expensive, and conflicts are expected to be infrequent. - -The simplest way to do this would be to store the branch reference in a specific file (e.g. `main.json`) and update it via an atomic "compare and swap" operation. -Unfortunately not all popular object stores support this operation (AWS S3 notably does not). - -However, all popular object stores _do_ support a comparable operation: "create if not exists". -In other words, object stores can guard against the race condition which occurs when two sessions attempt to create the same file at the same time. -This motivates the design of Icechunk's branch file naming convention. - -Each commit to an Icechunk branch augments a counter called the _sequence number_. -The first commit creates sequence number 0. -The next commit creates sequence number 1. Etc. -This sequence number is encoded into the branch reference file name. - -When a client checks out a branch, it keeps track of its current sequence number _N_. -When it tries to commit, it attempts to create the file corresponding to sequence number _N + 1_ in an atomic "create if not exists" operation. -If this succeeds, the commit is successful. -If this fails (because another client created that file already), the commit fails. -At this point, the client may choose retry its commit (possibly re-reading the updated data) and then create sequence number _N + 2_. - -Branch references are stored in the `r/` directory within a subdirectory corresponding to the branch name: `r/$BRANCH_NAME/`. -Branch names may not contain the `/` character. - -To facilitate easy lookups of the latest branch reference, we use the following encoding for the sequence number. -- subtract the sequence number from the integer `1099511627775` -- encode the resulting integer as a string using [Base 32 Crockford](https://www.crockford.com/base32.html) -- left-padding the string with 0s to a length of 8 characters -This produces a deterministic sequence of branch file names in which the latest sequence always appears first when sorted lexicographically, facilitating easy lookup by listing the object store. - -The full branch file name is then given by `r/$BRANCH_NAME/$ENCODED_SEQUENCE.json`. - -For example, the first main branch file is in a store, corresponding with sequence number 0, is always named `r/main/ZZZZZZZZ.json`. -The branch file for sequence number 100 is `r/main/ZZZZZZWV.json`. -The maximum number of commits allowed in an Icechunk repository is consequently `1099511627775`, -corresponding to the state file `r/main/00000000.json`. - -#### Tags - -Since tags are immutable, they are simpler than branches. - -Tag files follow the pattern `r/$TAG_NAME.json`. - -When creating a new tag, the client attempts to create the tag file using a "create if not exists" operation. -If successful, the tag is created successful. -If not, that means another client has already created that tag. - -Tags cannot be deleted once created. - -### Snapshot Files - -The snapshot file fully describes the schema of the repository, including all arrays and groups. - -The snapshot file has the following JSON schema: - -
-JSON Schema for Snapshot File - -```json -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "Snapshot", - "type": "object", - "required": [ - "metadata", - "nodes", - "properties", - "short_term_history", - "short_term_parents", - "started_at", - "total_parents" - ], - "properties": { - "metadata": { - "$ref": "#/definitions/SnapshotMetadata" - }, - "nodes": { - "type": "object", - "additionalProperties": { - "$ref": "#/definitions/NodeSnapshot" - } - }, - "properties": { - "type": "object", - "additionalProperties": true - }, - "short_term_history": { - "type": "array", - "items": { - "$ref": "#/definitions/SnapshotMetadata" - } - }, - "short_term_parents": { - "type": "integer", - "format": "uint16", - "minimum": 0.0 - }, - "started_at": { - "type": "string", - "format": "date-time" - }, - "total_parents": { - "type": "integer", - "format": "uint32", - "minimum": 0.0 - } - }, - "definitions": { - "ChunkIndices": { - "description": "An ND index to an element in a chunk grid.", - "type": "array", - "items": { - "type": "integer", - "format": "uint64", - "minimum": 0.0 - } - }, - "ChunkKeyEncoding": { - "type": "string", - "enum": [ - "Slash", - "Dot", - "Default" - ] - }, - "ChunkShape": { - "type": "array", - "items": { - "type": "integer", - "format": "uint64", - "minimum": 1.0 - } - }, - "Codec": { - "type": "object", - "required": [ - "name" - ], - "properties": { - "configuration": { - "type": [ - "object", - "null" - ], - "additionalProperties": true - }, - "name": { - "type": "string" - } - } - }, - "DataType": { - "oneOf": [ - { - "type": "string", - "enum": [ - "bool", - "int8", - "int16", - "int32", - "int64", - "uint8", - "uint16", - "uint32", - "uint64", - "float16", - "float32", - "float64", - "complex64", - "complex128" - ] - }, - { - "type": "object", - "required": [ - "rawbits" - ], - "properties": { - "rawbits": { - "type": "integer", - "format": "uint", - "minimum": 0.0 - } - }, - "additionalProperties": false - } - ] - }, - "FillValue": { - "oneOf": [ - { - "type": "object", - "required": [ - "Bool" - ], - "properties": { - "Bool": { - "type": "boolean" - } - }, - "additionalProperties": false - }, - { - "type": "object", - "required": [ - "Int8" - ], - "properties": { - "Int8": { - "type": "integer", - "format": "int8" - } - }, - "additionalProperties": false - }, - { - "type": "object", - "required": [ - "Int16" - ], - "properties": { - "Int16": { - "type": "integer", - "format": "int16" - } - }, - "additionalProperties": false - }, - { - "type": "object", - "required": [ - "Int32" - ], - "properties": { - "Int32": { - "type": "integer", - "format": "int32" - } - }, - "additionalProperties": false - }, - { - "type": "object", - "required": [ - "Int64" - ], - "properties": { - "Int64": { - "type": "integer", - "format": "int64" - } - }, - "additionalProperties": false - }, - { - "type": "object", - "required": [ - "UInt8" - ], - "properties": { - "UInt8": { - "type": "integer", - "format": "uint8", - "minimum": 0.0 - } - }, - "additionalProperties": false - }, - { - "type": "object", - "required": [ - "UInt16" - ], - "properties": { - "UInt16": { - "type": "integer", - "format": "uint16", - "minimum": 0.0 - } - }, - "additionalProperties": false - }, - { - "type": "object", - "required": [ - "UInt32" - ], - "properties": { - "UInt32": { - "type": "integer", - "format": "uint32", - "minimum": 0.0 - } - }, - "additionalProperties": false - }, - { - "type": "object", - "required": [ - "UInt64" - ], - "properties": { - "UInt64": { - "type": "integer", - 
"format": "uint64", - "minimum": 0.0 - } - }, - "additionalProperties": false - }, - { - "type": "object", - "required": [ - "Float16" - ], - "properties": { - "Float16": { - "type": "number", - "format": "float" - } - }, - "additionalProperties": false - }, - { - "type": "object", - "required": [ - "Float32" - ], - "properties": { - "Float32": { - "type": "number", - "format": "float" - } - }, - "additionalProperties": false - }, - { - "type": "object", - "required": [ - "Float64" - ], - "properties": { - "Float64": { - "type": "number", - "format": "double" - } - }, - "additionalProperties": false - }, - { - "type": "object", - "required": [ - "Complex64" - ], - "properties": { - "Complex64": { - "type": "array", - "items": [ - { - "type": "number", - "format": "float" - }, - { - "type": "number", - "format": "float" - } - ], - "maxItems": 2, - "minItems": 2 - } - }, - "additionalProperties": false - }, - { - "type": "object", - "required": [ - "Complex128" - ], - "properties": { - "Complex128": { - "type": "array", - "items": [ - { - "type": "number", - "format": "double" - }, - { - "type": "number", - "format": "double" - } - ], - "maxItems": 2, - "minItems": 2 - } - }, - "additionalProperties": false - }, - { - "type": "object", - "required": [ - "RawBits" - ], - "properties": { - "RawBits": { - "type": "array", - "items": { - "type": "integer", - "format": "uint8", - "minimum": 0.0 - } - } - }, - "additionalProperties": false - } - ] - }, - "Flags": { - "type": "array", - "items": [], - "maxItems": 0, - "minItems": 0 - }, - "ManifestExtents": { - "type": "array", - "items": { - "$ref": "#/definitions/ChunkIndices" - } - }, - "ManifestRef": { - "type": "object", - "required": [ - "extents", - "flags", - "object_id" - ], - "properties": { - "extents": { - "$ref": "#/definitions/ManifestExtents" - }, - "flags": { - "$ref": "#/definitions/Flags" - }, - "object_id": { - "$ref": "#/definitions/ObjectId" - } - } - }, - "NodeData": { - "oneOf": [ - { - "type": "string", - "enum": [ - "Group" - ] - }, - { - "type": "object", - "required": [ - "Array" - ], - "properties": { - "Array": { - "type": "array", - "items": [ - { - "$ref": "#/definitions/ZarrArrayMetadata" - }, - { - "type": "array", - "items": { - "$ref": "#/definitions/ManifestRef" - } - } - ], - "maxItems": 2, - "minItems": 2 - } - }, - "additionalProperties": false - } - ] - }, - "NodeSnapshot": { - "type": "object", - "required": [ - "id", - "node_data", - "path" - ], - "properties": { - "id": { - "type": "integer", - "format": "uint32", - "minimum": 0.0 - }, - "node_data": { - "$ref": "#/definitions/NodeData" - }, - "path": { - "type": "string" - }, - "user_attributes": { - "anyOf": [ - { - "$ref": "#/definitions/UserAttributesSnapshot" - }, - { - "type": "null" - } - ] - } - } - }, - "ObjectId": { - "description": "The id of a file in object store", - "type": "array", - "items": { - "type": "integer", - "format": "uint8", - "minimum": 0.0 - }, - "maxItems": 16, - "minItems": 16 - }, - "SnapshotMetadata": { - "type": "object", - "required": [ - "id", - "message", - "written_at" - ], - "properties": { - "id": { - "$ref": "#/definitions/ObjectId" - }, - "message": { - "type": "string" - }, - "written_at": { - "type": "string", - "format": "date-time" - } - } - }, - "StorageTransformer": { - "type": "object", - "required": [ - "name" - ], - "properties": { - "configuration": { - "type": [ - "object", - "null" - ], - "additionalProperties": true - }, - "name": { - "type": "string" - } - } - }, - "UserAttributes": { - "type": 
"object" - }, - "UserAttributesRef": { - "type": "object", - "required": [ - "flags", - "location", - "object_id" - ], - "properties": { - "flags": { - "$ref": "#/definitions/Flags" - }, - "location": { - "type": "integer", - "format": "uint32", - "minimum": 0.0 - }, - "object_id": { - "$ref": "#/definitions/ObjectId" - } - } - }, - "UserAttributesSnapshot": { - "oneOf": [ - { - "type": "object", - "required": [ - "Inline" - ], - "properties": { - "Inline": { - "$ref": "#/definitions/UserAttributes" - } - }, - "additionalProperties": false - }, - { - "type": "object", - "required": [ - "Ref" - ], - "properties": { - "Ref": { - "$ref": "#/definitions/UserAttributesRef" - } - }, - "additionalProperties": false - } - ] - }, - "ZarrArrayMetadata": { - "type": "object", - "required": [ - "chunk_key_encoding", - "chunk_shape", - "codecs", - "data_type", - "fill_value", - "shape" - ], - "properties": { - "chunk_key_encoding": { - "$ref": "#/definitions/ChunkKeyEncoding" - }, - "chunk_shape": { - "$ref": "#/definitions/ChunkShape" - }, - "codecs": { - "type": "array", - "items": { - "$ref": "#/definitions/Codec" - } - }, - "data_type": { - "$ref": "#/definitions/DataType" - }, - "dimension_names": { - "type": [ - "array", - "null" - ], - "items": { - "type": [ - "string", - "null" - ] - } - }, - "fill_value": { - "$ref": "#/definitions/FillValue" - }, - "shape": { - "type": "array", - "items": { - "type": "integer", - "format": "uint64", - "minimum": 0.0 - } - }, - "storage_transformers": { - "type": [ - "array", - "null" - ], - "items": { - "$ref": "#/definitions/StorageTransformer" - } - } - } - } - } -} -``` - -
- -### Attributes Files - -Attribute files hold user-defined attributes separately from the snapshot file. - -> [!WARNING] -> Attribute files have not been implemented. - -### Chunk Manifest Files - -A chunk manifest file stores chunk references. -Chunk references from multiple arrays can be stored in the same chunk manifest. -The chunks from a single array can also be spread across multiple manifests. - -
-JSON Schema for Chunk Manifest Files - -```json -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "title": "Manifest", - "type": "object", - "required": [ - "chunks" - ], - "properties": { - "chunks": { - "type": "object", - "additionalProperties": { - "$ref": "#/definitions/ChunkPayload" - } - } - }, - "definitions": { - "ChunkPayload": { - "oneOf": [ - { - "type": "object", - "required": [ - "Inline" - ], - "properties": { - "Inline": { - "type": "array", - "items": { - "type": "integer", - "format": "uint8", - "minimum": 0.0 - } - } - }, - "additionalProperties": false - }, - { - "type": "object", - "required": [ - "Virtual" - ], - "properties": { - "Virtual": { - "$ref": "#/definitions/VirtualChunkRef" - } - }, - "additionalProperties": false - }, - { - "type": "object", - "required": [ - "Ref" - ], - "properties": { - "Ref": { - "$ref": "#/definitions/ChunkRef" - } - }, - "additionalProperties": false - } - ] - }, - "ChunkRef": { - "type": "object", - "required": [ - "id", - "length", - "offset" - ], - "properties": { - "id": { - "$ref": "#/definitions/ObjectId" - }, - "length": { - "type": "integer", - "format": "uint64", - "minimum": 0.0 - }, - "offset": { - "type": "integer", - "format": "uint64", - "minimum": 0.0 - } - } - }, - "ObjectId": { - "description": "The id of a file in object store", - "type": "array", - "items": { - "type": "integer", - "format": "uint8", - "minimum": 0.0 - }, - "maxItems": 16, - "minItems": 16 - }, - "VirtualChunkRef": { - "type": "object", - "required": [ - "length", - "location", - "offset" - ], - "properties": { - "length": { - "type": "integer", - "format": "uint64", - "minimum": 0.0 - }, - "location": { - "type": "string" - }, - "offset": { - "type": "integer", - "format": "uint64", - "minimum": 0.0 - } - } - } - } -} -``` - -
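
The three payload variants above determine what kind of read a client must perform to obtain the chunk bytes. The sketch below is a hypothetical, simplified mirror of `ChunkPayload`; the names, signatures, and example values are illustrative only and are not the crate's API.

```rust
// Illustrative mirror of the ChunkPayload variants in the schema above,
// showing what read each variant implies.
type ObjectId = [u8; 16];

enum ChunkPayload {
    /// Small chunk stored directly inside the manifest.
    Inline(Vec<u8>),
    /// Byte range inside a chunk file written by Icechunk (under `$ROOT/c/`).
    Ref { id: ObjectId, offset: u64, length: u64 },
    /// Byte range inside an external object (e.g. a NetCDF file on S3).
    Virtual { location: String, offset: u64, length: u64 },
}

fn describe_fetch(payload: &ChunkPayload) -> String {
    match payload {
        ChunkPayload::Inline(bytes) => {
            format!("use the {} inline bytes as-is", bytes.len())
        }
        ChunkPayload::Ref { id, offset, length } => {
            format!("read {length} bytes at offset {offset} from chunk file c/{id:02x?}")
        }
        ChunkPayload::Virtual { location, offset, length } => {
            format!("read {length} bytes at offset {offset} from {location}")
        }
    }
}

fn main() {
    let payloads = [
        ChunkPayload::Inline(vec![1, 2, 3]),
        ChunkPayload::Ref { id: [0; 16], offset: 0, length: 4096 },
        ChunkPayload::Virtual {
            location: "s3://example-bucket/example.nc".to_string(),
            offset: 119_339,
            length: 80,
        },
    ];
    for payload in &payloads {
        println!("{}", describe_fetch(payload));
    }
}
```
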
- -### Chunk Files - -Chunk files contain the compressed binary chunks of a Zarr array. -Icechunk permits quite a bit of flexibility about how chunks are stored. -Chunk files can be: - -- One chunk per chunk file (i.e. standard Zarr) -- Multiple contiguous chunks from the same array in a single chunk file (similar to Zarr V3 shards) -- Chunks from multiple different arrays in the same file -- Other file types (e.g. NetCDF, HDF5) which contain Zarr-compatible chunks - -Applications may choose to arrange chunks within files in different ways to optimize I/O patterns. - -## Algorithms - -### Initialize New Repository - -A new repository is initialized by creating a new [possibly empty] snapshot file and then creating the first file in the main branch sequence. - -If another client attempts to initialize a repository in the same location, only one can succeed. - -### Read from Repository - -#### From Snapshot ID - -If the specific snapshot ID is known, a client can open it directly in read only mode. - -1. Use the specified shapshot ID to fetch the snapshot file. -1. Fetch desired attributes and values from arrays. - -#### From Branch - -Usually, a client will want to read from the latest branch (e.g. `main`). - -1. List the object store prefix `r/$BRANCH_NAME/` to obtain the latest branch file in the sequence. Due to the encoding of the sequence number, this should be the _first file_ in lexicographical order. -1. Read the branch file to obtain the snapshot ID. -1. Use the shapshot ID to fetch the snapshot file. -1. Fetch desired attributes and values from arrays. - -#### From Tag - -Opening a repository from a tag results in a read-only view. - -1. Read the tag file found at `r/$TAG_NAME.json` to obtain the snapshot ID. -1. Use the shapshot ID to fetch the snapshot file. -1. Fetch desired attributes and values from arrays. - -### Write New Snapshot - -Writing can only be done on a branch. - -1. Open a repository at a specific branch as described above, keeping track of the sequence number and branch name in the session context. -1. [optional] Write new chunk files. -1. [optional] Write new chunk manifests. -1. Write a new snapshot file. -1. Attempt to write the next branch file in the sequence - a. If successful, the commit succeeded and the branch is updated. - b. If unsuccessful, attempt to reconcile and retry the commit. - -### Create New Tag - -A tag can be created from any snapshot. - -1. Open the repository at a specific snapshot. -1. Attempt to create the tag file. - a. If successful, the tag was created. - b. If unsuccessful, the tag already exists. - -## Appendices - -### Comparison with Iceberg - -Like Iceberg, Icechunk uses a series of linked metadata files to describe the state of the repository. -But while Iceberg describes a table, the Icechunk repository is a Zarr store (hierarchical structure of Arrays and Groups.) - -| Iceberg Entity | Icechunk Entity | Comment | -|--|--|--| -| Table | Repository | The fundamental entity described by the spec | -| Column | Array | The logical container for a homogenous collection of values | -| Snapshot | Snapshot | A single committed snapshot of the repository | -| Catalog | N/A | There is no concept of a catalog in Icechunk. Consistency provided by object store. |
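
As a worked example of the branch-file naming convention described under Reference Files (and relied on by the "From Branch" read algorithm above), the sketch below encodes a commit sequence number by subtracting it from 1099511627775, writing the result in Crockford Base32, and left-padding to 8 characters. The function name is hypothetical; the assertions reproduce the examples from the spec text (sequence 0 → `ZZZZZZZZ`, 100 → `ZZZZZZWV`, the maximum → `00000000`).

```rust
// Illustrative only: a stand-alone implementation of the branch-file
// sequence-number encoding described in the Reference Files section,
// not the crate's code.
const CROCKFORD: &[u8; 32] = b"0123456789ABCDEFGHJKMNPQRSTVWXYZ";
const MAX_SEQ: u64 = 1_099_511_627_775; // 32^8 - 1

fn encode_branch_sequence(seq: u64) -> String {
    assert!(seq <= MAX_SEQ);
    let mut n = MAX_SEQ - seq; // invert so the newest sequence sorts first
    let mut out = [0u8; 8]; // always 8 characters, so values are left-padded with '0'
    for slot in out.iter_mut().rev() {
        *slot = CROCKFORD[(n % 32) as usize];
        n /= 32;
    }
    String::from_utf8(out.to_vec()).expect("alphabet is ASCII")
}

fn main() {
    // These match the examples given in the spec text.
    assert_eq!(encode_branch_sequence(0), "ZZZZZZZZ");
    assert_eq!(encode_branch_sequence(100), "ZZZZZZWV");
    assert_eq!(encode_branch_sequence(MAX_SEQ), "00000000");
    // The first commit on `main` therefore creates:
    println!("r/main/{}.json", encode_branch_sequence(0));
}
```

Listing the `r/main/` prefix and taking the lexicographically first key therefore yields the latest commit, which is exactly what the "From Branch" read algorithm relies on.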