From f7730ac12e4530d104a29e458b89de5e08b6a86b Mon Sep 17 00:00:00 2001
From: Alex Garcia
Date: Fri, 15 Nov 2024 10:40:44 -0800
Subject: [PATCH 1/8] experimental metadata docs

---
 site/metadata-beta.md | 299 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 299 insertions(+)
 create mode 100644 site/metadata-beta.md

diff --git a/site/metadata-beta.md b/site/metadata-beta.md
new file mode 100644
index 0000000..4ebed37
--- /dev/null
+++ b/site/metadata-beta.md
@@ -0,0 +1,299 @@
# Experimental Metadata Filtering Builds

The `sqlite-vec` project has a series of pull requests
([#122](https://github.com/asg017/sqlite-vec/pull/122),
[#123](https://github.com/asg017/sqlite-vec/pull/123), and
[#124](https://github.com/asg017/sqlite-vec/pull/124)) that will add proper
metadata column support to `vec0` virtual tables.

But they aren't merged yet! So I've packaged pre-compiled extensions with these
features baked in, so that others can try it for themselves. Once those pull
requests are merged, this page will be removed.

As a quick sample, this is what metadata columns will look like:

```sql
create virtual table vec_movies using vec0(
  -- aliased primary key
  movie_id integer primary key,

  -- vector column
  synopsis_embedding float[1024],

  -- partition key (internally shards vectors)
  user_id integer partition key,

  -- metadata columns (indexed alongside vectors)
  genre text,
  num_reviews int,
  mean_rating float,

  -- auxiliary columns (not indexed)
  +title text,
  +synopsis text
);

select
  movie_id,
  title,
  genre,
  num_reviews,
  mean_rating,
  distance
from vec_movies
where synopsis_embedding match '[...]'
  and genre = 'scifi'
  and num_reviews between 100 and 500
  and mean_rating > 3.5
  and k = 5;
/*
┌──────────┬─────────────────────┬─────────┬─────────────┬──────────────────┬──────────┐
│ movie_id │ title               │ genre   │ num_reviews │ mean_rating      │ distance │
├──────────┼─────────────────────┼─────────┼─────────────┼──────────────────┼──────────┤
│ 13       │ 'The Matrix'        │ 'scifi' │ 423         │ 4.5              │ 2.5      │
│ 18       │ 'Inception'         │ 'scifi' │ 201         │ 5.0              │ 2.5      │
│ 21       │ 'Gravity'           │ 'scifi' │ 342         │ 4.0              │ 5.5      │
│ 22       │ 'Dune'              │ 'scifi' │ 451         │ 4.40000009536743 │ 6.5      │
│ 8        │ 'Blade Runner 2049' │ 'scifi' │ 301         │ 5.0              │ 7.5      │
└──────────┴─────────────────────┴─────────┴─────────────┴──────────────────┴──────────┘
*/
```

## Install

To try it out yourself, download one of the following ZIP files that contain
pre-compiled SQLite extensions. You can manually load them into your
Python/JavaScript/Ruby/etc. projects to try things out.
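
For example, with the `sqlite3` CLI (a minimal sketch: the exact path and
filename depend on which ZIP you downloaded and where you extracted it):

```sql
.load ./vec0
select vec_version();
```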

| Platform                                                | Link                                                                                                                                                                          |
| ------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| MacOS ARM                                               | [`sqlite-vec-macos-aarch64-extension.zip`](https://fly.storage.tigris.dev/sqlite-vec-public-static/metadata-filtering-beta/v1-052ba4b/sqlite-vec-macos-aarch64-extension.zip)   |
| MacOS x86_64                                            | [`sqlite-vec-macos-x86_64-extension.zip`](https://fly.storage.tigris.dev/sqlite-vec-public-static/metadata-filtering-beta/v1-052ba4b/sqlite-vec-macos-x86_64-extension.zip)     |
| Linux ARM                                               | [`sqlite-vec-linux-aarch64-extension.zip`](https://fly.storage.tigris.dev/sqlite-vec-public-static/metadata-filtering-beta/v1-052ba4b/sqlite-vec-linux-aarch64-extension.zip)   |
| Linux x86_64                                            | [`sqlite-vec-linux-x86_64-extension.zip`](https://fly.storage.tigris.dev/sqlite-vec-public-static/metadata-filtering-beta/v1-052ba4b/sqlite-vec-linux-x86_64-extension.zip)     |
| Windows x86_64                                          | [`sqlite-vec-windows-x86_64-extension.zip`](https://fly.storage.tigris.dev/sqlite-vec-public-static/metadata-filtering-beta/v1-052ba4b/sqlite-vec-windows-x86_64-extension.zip) |
| Cosmopolitan (`sqlite3` CLI with `sqlite-vec` baked in) | [`sqlite-vec-cosmopolitan.zip`](https://fly.storage.tigris.dev/sqlite-vec-public-static/metadata-filtering-beta/v1-052ba4b/sqlite-vec-cosmopolitan.zip)                         |

To check which experimental version you are on, run `SELECT vec_version()`. The
most recent version is `v-metadata-experiment.01`.

The rest of this page documents how to use the new metadata, partition key, and
auxiliary columns in these experimental builds.

## Experimental Status

This work isn't complete yet, so there are some subtle bugs and TODOs:

- You cannot `UPDATE` a `PARTITION KEY` value yet.
- KNN queries with a `WHERE` constraint on a `TEXT` metadata column that's
  longer than `12` characters will fail.
- `NULL` values are not allowed in metadata columns.
- `PARTITION KEY` columns currently only support the `=` operator; the `!=`,
  `<=`, `>=`, `<`, and `>` operators will be supported later.

These will be fixed before the official release.

## Metadata in `vec0` Virtual Tables

There are three ways to store non-vector data in `vec0` virtual tables:
metadata columns, partition keys, and auxiliary columns. Each option has its
own benefits and limitations.

```sql
create virtual table vec_chunks using vec0(
  document_id integer primary key,
  contents_embedding float[768],

  -- partition key column, denoted by 'partition key'
  user_id integer partition key,

  -- metadata column, appears as a normal column definition
  label text,

  -- auxiliary column, denoted by '+'
  +contents text
);
```

A quick summary of each option:

| Column Type       | Description                                                              | Benefits                                             | Limitations                                                                                                                 |
| ----------------- | ------------------------------------------------------------------------ | ---------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------- |
| Metadata columns  | Stores boolean, integer, floating point, or text data alongside vectors | Can be included in the `WHERE` clause of a KNN query | Slower full scans, slightly inefficient with long strings (`> 12` characters)                                               |
| Auxiliary columns | Stores any kind of data in a separate internal table                    | Eliminates the need for an external `JOIN`           | Cannot appear in the `WHERE` clause of a KNN query                                                                           |
| Partition Key     | Internally shards the vector index on a given key                       | Makes selective queries much faster                  | Can cause over-sharding and slow KNN queries if not used carefully; aim for 100s of vectors per unique partition key value  |

### Metadata Columns

Metadata columns are extra "regular" columns that you can include in a `vec0`
table definition. These columns are indexed along with the declared vector
columns, and allow you to include extra `WHERE` constraints in KNN queries.

```sql
create virtual table vec_movies using vec0(
  movie_id integer primary key,
  synopsis_embedding float[1024],
  genre text,
  num_reviews int,
  mean_rating float,
  contains_violence boolean
);
```

In the `vec0` constructor, the `genre`, `num_reviews`, `mean_rating`, and
`contains_violence` columns are metadata columns, with their specified types.

A sample KNN query on this table could look like:

```sql
select *
from vec_movies
where synopsis_embedding match '[...]'
  and k = 5
  and genre = 'scifi'
  and num_reviews between 100 and 500
  and mean_rating > 3.5
  and contains_violence = false;
```

The first two conditions in the `WHERE` clause (`synopsis_embedding match` and
`k = 5`) denote that the query is a KNN query. The other conditions are metadata
constraints that `sqlite-vec` will recognize and apply during the KNN
calculation. In other words, the above query returns at most 5 rows, all of
which satisfy the `WHERE` constraints on their metadata column values.

#### Metadata Column Declaration

Metadata columns are declared in the `vec0` constructor just like regular
column definitions, with the column name first, then the column type.

Only the following column types are supported in metadata columns. All these
columns are strictly typed.

- `TEXT` for text and strings
- `INTEGER` for 8-byte integers
- `FLOAT` for 8-byte floating-point numbers
- `BOOLEAN` for 1-bit `0` or `1` values

Other column types may be supported in the future. Column type names are
case-insensitive.

Additional column constraints like `UNIQUE` or `NOT NULL` are not supported.

A maximum of 16 metadata columns can be declared in a `vec0` virtual table.
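
Inserting rows works like a regular SQLite `INSERT`. A minimal sketch against
the `vec_movies` table above (the values are illustrative, and `:embedding`
stands in for the 1024-dimension synopsis embedding, as JSON text or a raw
`float32` blob):

```sql
insert into vec_movies(movie_id, synopsis_embedding, genre, num_reviews,
                       mean_rating, contains_violence)
values (1, :embedding, 'scifi', 423, 4.5, 0);
```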

#### Supported operations

Metadata column `WHERE` conditions in a KNN query only work with the following
operators:

- `=` Equal to
- `!=` Not equal to
- `>` Greater than
- `>=` Greater than or equal to
- `<` Less than
- `<=` Less than or equal to

Using any other operator like `IS NULL`, `LIKE`, `GLOB`, `REGEXP`, or any scalar
function will result in an error or incorrect results.

Boolean columns only support the `=` and `!=` operators.

### Partition Key Columns

Partition key columns allow one to internally shard the vector index based on a
given key. Any `=` constraint in a `WHERE` clause on a partition key column will
pre-filter the vector index to the matching shards before the KNN calculation
begins.

For example, say you're performing vector search on a large dataset of
documents. However, each document belongs to a user, and users can only search
their own documents. It would be wasteful to perform a brute-force search over
all documents if you only care about one user at a time. So, you can partition
the vector index based on user ID like so:

```sql
create virtual table vec_documents using vec0(
  document_id integer primary key,
  user_id integer partition key,
  contents_embedding float[1024]
);
```

Then during a KNN query, you can constrain results to a specific user in the
`WHERE` clause like so:

```sql
select
  document_id,
  user_id,
  distance
from vec_documents
where contents_embedding match :query
  and k = 20
  and user_id = 123;
```

`sqlite-vec` will recognize the `user_id = 123` constraint and pre-filter
vectors during the KNN search. Vectors with the same partition key values are
collocated together, so this is a fast operation.

Another example: say you're performing vector search on a large dataset of news
headlines from the past 100 years. However, in your application, most users
only want to search a subset of articles based on when they were written, like
"in the past ten years" or "during the Obama administration." You can partition
based on published date like so:

```sql
create virtual table vec_articles using vec0(
  article_id integer primary key,
  published_date text partition key,
  headline_embedding float[1024]
);
```

And a KNN query:

```sql
select
  article_id,
  published_date,
  distance
from vec_articles
where headline_embedding match :query
  and k = 20
  and published_date between '2009-01-20' and '2017-01-20'; -- Obama administration
```

But be careful! Over-using partition key columns can lead to over-sharding and
slower KNN queries. As a rule of thumb, make sure that every unique partition
key value has ~100s of vectors associated with it. In the above examples, make
sure that every user has on the order of dozens or hundreds of documents, and
that every publish date has dozens or hundreds of articles. If they don't and
you're noticing slow queries, try a broader partition key, like
`organization_id` or `published_month`.

A maximum of 4 partition key columns can be declared in a `vec0` virtual table,
but use caution if you find yourself using more than 1. Vectors are sharded
along each unique combination of partition key values, so over-sharding is more
common with more partition key columns.

### Auxiliary Columns

Auxiliary columns store additional unindexed data separate from the internal
vector index. They are meant for larger metadata that will never appear in the
`WHERE` clause of a KNN query, eliminating the need for a separate `JOIN`.

Auxiliary columns are denoted by a `+` prefix in their column definition, like
so:

```sql
create virtual table vec_chunks using vec0(
  contents_embedding float[1024],
  +contents text
);

select
  rowid,
  contents,
  distance
from vec_chunks
where contents_embedding match :query
  and k = 10;
```

Here we store the text contents of each chunk in the `contents` auxiliary
column. When we perform a KNN query, we can reference the `contents` column in
the `SELECT` clause, to get the raw text contents of the most relevant chunks.

A similar approach can be used for image embeddings:

```sql
create virtual table vec_image_chunks using vec0(
  image_embedding float[1024],
  +image blob
);

select
  rowid,
  image,
  distance
from vec_image_chunks
where image_embedding match :query
  and k = 10;
```

Here the `image` auxiliary column stores the raw image file in a large `BLOB`
column. It can appear in the `SELECT` clause of the KNN query, to get the most
relevant raw images.

In general, auxiliary columns are good for large text, blobs, URLs, or other
datatypes that won't be a part of the `WHERE` clause of a KNN query. If your
column will often appear in a `SELECT` clause but not the `WHERE` clause, then
auxiliary columns are a good fit.

A maximum of 16 auxiliary columns can be declared in a `vec0` virtual table.

From 04c6da4c628071dc9322d1a9786dc793837f3479 Mon Sep 17 00:00:00 2001
From: Alex Garcia
Date: Fri, 15 Nov 2024 10:43:44 -0800
Subject: [PATCH 2/8] update linux arm builds

---
 .github/workflows/release.yaml | 16 ++++++++++++++++
 .github/workflows/test.yaml    | 12 ------------
 2 files changed, 16 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index e6057d0..a977b38 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -55,6 +55,18 @@ jobs:
       with:
         name: sqlite-vec-windows-x86_64-extension
         path: dist/*
+  build-linux-aarch64-extension:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - run: sudo apt-get install gcc-arm-linux-gnueabihf
+      - run: ./scripts/vendor.sh
+      - run: make sqlite-vec.h
+      - run: make CC=arm-linux-gnueabihf-gcc loadable static
+      - uses: actions/upload-artifact@v4
+        with:
+          name: sqlite-vec-linux-aarch64-extension
+          path: dist/*
   build-cosmopolitan:
     runs-on: macos-latest
     permissions:
@@ -190,6 +202,10 @@ jobs:
       with:
         name: sqlite-vec-linux-x86_64-extension
         path: dist/linux-x86_64
+      - uses: actions/download-artifact@v4
+        with:
+          name: sqlite-vec-linux-aarch64-extension
+          path: dist/linux-aarch64
       - uses: actions/download-artifact@v4
         with:
           name: sqlite-vec-macos-x86_64-extension
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index a601521..abb8490 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -141,12 +141,7 @@ jobs:
   build-linux-aarch64-extension:
     runs-on: ubuntu-latest
     steps:
-      - uses: green-coding-solutions/eco-ci-energy-estimation@v4
-        with:
-          task: start-measurement
       - uses: actions/checkout@v4
-        with:
-          version: "latest"
       - run: sudo apt-get install gcc-arm-linux-gnueabihf
       - run: ./scripts/vendor.sh
       - run: make sqlite-vec.h
@@ -155,13 +150,6 @@
       with:
         name: sqlite-vec-linux-aarch64-extension
         path: dist/*
-      - uses: green-coding-solutions/eco-ci-energy-estimation@v4
-        with:
-          task: get-measurement
-          label: "all"
-      - uses: green-coding-solutions/eco-ci-energy-estimation@v4
-        with:
-          task: display-results
   build-wasm32-emscripten:
     runs-on: ubuntu-latest
     steps:

From 
e412860897c5a03171733b2cab2e8e6fcc315c6d Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Fri, 15 Nov 2024 10:51:26 -0800 Subject: [PATCH 3/8] v0.1.4-alpha.3 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 8af76f5..8bffce1 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.4-alpha.2 \ No newline at end of file +0.1.4-alpha.3 \ No newline at end of file From 67f8ff8517815df78da322bd09b4b375226c2aed Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Fri, 15 Nov 2024 11:02:10 -0800 Subject: [PATCH 4/8] v0.1.4 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 8bffce1..446ba66 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.4-alpha.3 \ No newline at end of file +0.1.4 \ No newline at end of file From 9780f6d445f63389f319fe3defa5ec0611952397 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Fri, 15 Nov 2024 11:17:55 -0800 Subject: [PATCH 5/8] bump dist to fix linux arm builds --- .github/workflows/release.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index a977b38..dd88811 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -255,7 +255,7 @@ jobs: name: sqlite-vec-iossimulator-x86_64-extension path: dist/iossimulator-x86_64 - run: | - curl -L https://github.com/asg017/sqlite-dist/releases/download/v0.0.1-alpha.16/sqlite-dist-x86_64-unknown-linux-gnu.tar.xz \ + curl -L https://github.com/asg017/sqlite-dist/releases/download/v0.0.1-alpha.17/sqlite-dist-x86_64-unknown-linux-gnu.tar.xz \ | tar xfJ - --strip-components 1 - run: make sqlite-vec.h - run: ./sqlite-dist ./sqlite-dist.toml --input dist/ --output distx/ --version $(cat VERSION) From 5183ab4b345f39a526620812c19340d673a43696 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Fri, 15 Nov 2024 11:18:19 -0800 Subject: [PATCH 6/8] v0.1.5-alpha.1 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 446ba66..8a0b646 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.4 \ No newline at end of file +0.1.5-alpha.1 \ No newline at end of file From ee3654701f7b8efe4802ff1caed24514f43443dd Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Fri, 15 Nov 2024 11:22:50 -0800 Subject: [PATCH 7/8] v0.1.5 --- VERSION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VERSION b/VERSION index 8a0b646..def9a01 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.5-alpha.1 \ No newline at end of file +0.1.5 \ No newline at end of file From 6658624172af9b53abeaa5311b794d67e8b241fb Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Wed, 20 Nov 2024 00:02:04 -0800 Subject: [PATCH 8/8] PARTITION KEY support (#122) * initial pass at PARTITION KEY support. * unit tests * gha this PR branch * fixup tests * doc internal * fix tests, KNN/rowids in * define SQLITE_INDEX_CONSTRAINT_OFFSET * whoops * update tests, syrupy, use uv * un ignore pyproject.toml * dot * tests/ * type error? * win: .exe, update error name * try fix macos python, paren around expr? * win bash? 
* dbg :( * explicit error * op * dbg win * win ./tests/.venv/Scripts/python.exe * block UPDATEs on partition key values for now --- .github/workflows/test.yaml | 135 +-- .gitignore | 1 - ARCHITECTURE.md | 54 ++ Makefile | 7 +- TODO | 5 + sqlite-vec.c | 915 ++++++++++++++---- test.sql | 49 + tests/.python-version | 1 + tests/__snapshots__/test-partition-keys.ambr | 245 +++++ tests/conftest.py | 12 + tests/pyproject.toml | 9 + ...orrectness.py => skip.test-correctness.py} | 0 tests/test-loadable.py | 45 +- tests/test-partition-keys.py | 115 +++ tests/test-unit.c | 54 ++ tests/uv.lock | 120 +++ 16 files changed, 1522 insertions(+), 245 deletions(-) create mode 100644 ARCHITECTURE.md create mode 100644 TODO create mode 100644 test.sql create mode 100644 tests/.python-version create mode 100644 tests/__snapshots__/test-partition-keys.ambr create mode 100644 tests/conftest.py create mode 100644 tests/pyproject.toml rename tests/{test-correctness.py => skip.test-correctness.py} (100%) create mode 100644 tests/test-partition-keys.py create mode 100644 tests/test-unit.c create mode 100644 tests/uv.lock diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index abb8490..96da148 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -3,6 +3,7 @@ on: push: branches: - main + - partition-by permissions: contents: read jobs: @@ -10,16 +11,92 @@ jobs: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 + - uses: astral-sh/setup-uv@v3 with: - python-version: "3.12" + enable-cache: true - run: ./scripts/vendor.sh - run: make loadable static - - run: pip install pytest numpy; make test-loadable + - run: uv sync --directory tests + - run: make test-loadable python=./tests/.venv/bin/python - uses: actions/upload-artifact@v4 with: name: sqlite-vec-linux-x86_64-extension path: dist/* + build-macos-x86_64-extension: + runs-on: macos-12 + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + - run: uv python install 3.12 + - run: ./scripts/vendor.sh + - run: make loadable static + - run: uv sync --directory tests + - run: make test-loadable python=./tests/.venv/bin/python + - uses: actions/upload-artifact@v4 + with: + name: sqlite-vec-macos-x86_64-extension + path: dist/* + build-macos-aarch64-extension: + runs-on: macos-14 + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + - run: ./scripts/vendor.sh + - run: make loadable static + - run: uv sync --directory tests + - run: make test-loadable python=./tests/.venv/bin/python + - uses: actions/upload-artifact@v4 + with: + name: sqlite-vec-macos-aarch64-extension + path: dist/* + build-windows-x86_64-extension: + runs-on: windows-2019 + steps: + - uses: actions/checkout@v4 + - uses: ilammy/msvc-dev-cmd@v1 + - uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + - run: ./scripts/vendor.sh + shell: bash + - run: make sqlite-vec.h + - run: mkdir dist + - run: cl.exe /fPIC -shared /W4 /Ivendor/ /O2 /LD sqlite-vec.c -o dist/vec0.dll + - run: uv sync --directory tests + - run: make test-loadable python=./tests/.venv/Scripts/python.exe + shell: bash + - uses: actions/upload-artifact@v4 + with: + name: sqlite-vec-windows-x86_64-extension + path: dist/* + build-linux-aarch64-extension: + runs-on: ubuntu-latest + steps: + - uses: green-coding-solutions/eco-ci-energy-estimation@v4 + with: + task: start-measurement + - uses: actions/checkout@v4 + with: + version: "latest" + - run: sudo apt-get 
install gcc-arm-linux-gnueabihf + - run: ./scripts/vendor.sh + - run: make sqlite-vec.h + - run: make CC=arm-linux-gnueabihf-gcc loadable static + - uses: actions/upload-artifact@v4 + with: + name: sqlite-vec-linux-aarch64-extension + path: dist/* + - uses: green-coding-solutions/eco-ci-energy-estimation@v4 + with: + task: get-measurement + label: "all" + - uses: green-coding-solutions/eco-ci-energy-estimation@v4 + with: + task: display-results build-android-extensions: runs-on: ubuntu-latest strategy: @@ -98,58 +175,6 @@ jobs: with: name: sqlite-vec-${{ matrix.platforms.name }}-extension path: dist/* - build-macos-x86_64-extension: - runs-on: macos-12 - steps: - - uses: actions/checkout@v4 - - run: ./scripts/vendor.sh - - run: make loadable static - - run: /usr/local/opt/python@3/libexec/bin/python -m pip install --break-system-packages pytest numpy; make test-loadable python=/usr/local/opt/python@3/libexec/bin/python - - uses: actions/upload-artifact@v4 - with: - name: sqlite-vec-macos-x86_64-extension - path: dist/* - build-macos-aarch64-extension: - runs-on: macos-14 - steps: - - uses: actions/checkout@v4 - - run: ./scripts/vendor.sh - - run: make loadable static - - run: /opt/homebrew/opt/python3/libexec/bin/python -m pip install pytest numpy --break-system-packages; make test-loadable python=/opt/homebrew/opt/python3/libexec/bin/python - - uses: actions/upload-artifact@v4 - with: - name: sqlite-vec-macos-aarch64-extension - path: dist/* - build-windows-x86_64-extension: - runs-on: windows-2019 - steps: - - uses: actions/checkout@v4 - - uses: ilammy/msvc-dev-cmd@v1 - - uses: actions/setup-python@v5 - with: - python-version: "3.12" - - run: ./scripts/vendor.sh - shell: bash - - run: make sqlite-vec.h - - run: mkdir dist - - run: cl.exe /fPIC -shared /W4 /Ivendor/ /O2 /LD sqlite-vec.c -o dist/vec0.dll - - run: pip install pytest numpy; make test-loadable - - uses: actions/upload-artifact@v4 - with: - name: sqlite-vec-windows-x86_64-extension - path: dist/* - build-linux-aarch64-extension: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - run: sudo apt-get install gcc-arm-linux-gnueabihf - - run: ./scripts/vendor.sh - - run: make sqlite-vec.h - - run: make CC=arm-linux-gnueabihf-gcc loadable static - - uses: actions/upload-artifact@v4 - with: - name: sqlite-vec-linux-aarch64-extension - path: dist/* build-wasm32-emscripten: runs-on: ubuntu-latest steps: diff --git a/.gitignore b/.gitignore index 38f9876..ef7a661 100644 --- a/.gitignore +++ b/.gitignore @@ -26,4 +26,3 @@ sqlite-vec.h tmp/ poetry.lock -pyproject.toml \ No newline at end of file diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 0000000..8ac9501 --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,54 @@ +## `vec0` + +### idxStr + +The `vec0` idxStr is a string composed of single "header" character and 0 or +more "blocks" of 4 characters each. + +The "header" charcter denotes the type of query plan, as determined by the +`enum vec0_query_plan` values. The current possible values are: + +| Name | Value | Description | +| -------------------------- | ----- | ---------------------------------------------------------------------- | +| `VEC0_QUERY_PLAN_FULLSCAN` | `'1'` | Perform a full-scan on all rows | +| `VEC0_QUERY_PLAN_POINT` | `'2'` | Perform a single-lookup point query for the provided rowid | +| `VEC0_QUERY_PLAN_KNN` | `'3'` | Perform a KNN-style query on the provided query vector and parameters. | + +Each 4-character "block" is associated with a corresponding value in `argv[]`. 
For example, the 1st block at byte offset `1-4` (inclusive) is the 1st block and is associated with `argv[1]`. The 2nd block at byte offset `5-8` (inclusive) is associated with `argv[2]` and so on. Each block describes what kind of value or filter the given `argv[i]` value is. + + +#### `VEC0_IDXSTR_KIND_KNN_MATCH` (`'{'`) + +`argv[i]` is the query vector of the KNN query. + +The remaining 3 characters of the block are `_` fillers. + +#### `VEC0_IDXSTR_KIND_KNN_K` (`'}'`) + +`argv[i]` is the limit/k value of the KNN query. + +The remaining 3 characters of the block are `_` fillers. + +#### `VEC0_IDXSTR_KIND_KNN_ROWID_IN` (`'['`) + +`argv[i]` is the optional `rowid in (...)` value, and must be handled with [`sqlite3_vtab_in_first()` / +`sqlite3_vtab_in_next()`](https://www.sqlite.org/c3ref/vtab_in_first.html). + +The remaining 3 characters of the block are `_` fillers. + +#### `VEC0_IDXSTR_KIND_KNN_PARTITON_CONSTRAINT` (`']'`) + +`argv[i]` is a "constraint" on a specific partition key. + +The second character of the block denotes which partition key to filter on, using `A` to denote the first partition key column, `B` for the second, etc. It is encoded with `'A' + partition_idx` and can be decoded with `c - 'A'`. + +The third character of the block denotes which operator is used in the constraint. It will be one of the values of `enum vec0_partition_operator`, as only a subset of operations are supported on partition keys. + +The fourth character of the block is a `_` filler. + + +#### `VEC0_IDXSTR_KIND_POINT_ID` (`'!'`) + +`argv[i]` is the value of the rowid or id to match against for the point query. + +The remaining 3 characters of the block are `_` fillers. diff --git a/Makefile b/Makefile index 8eb6170..1496b7a 100644 --- a/Makefile +++ b/Makefile @@ -185,13 +185,16 @@ publish-release: # -k test_vec0_update test-loadable: loadable - $(PYTHON) -m pytest -vv -s -x tests/test-loadable.py + $(PYTHON) -m pytest -vv -s -x tests/test-*.py test-loadable-snapshot-update: loadable $(PYTHON) -m pytest -vv tests/test-loadable.py --snapshot-update test-loadable-watch: - watchexec -w sqlite-vec.c -w tests/test-loadable.py -w Makefile --clear -- make test-loadable + watchexec --exts c,py,Makefile --clear -- make test-loadable + +test-unit: + $(CC) tests/test-unit.c sqlite-vec.c -I./ -Ivendor -o $(prefix)/test-unit && $(prefix)/test-unit site-dev: npm --prefix site run dev diff --git a/TODO b/TODO new file mode 100644 index 0000000..a487ddd --- /dev/null +++ b/TODO @@ -0,0 +1,5 @@ +# partition + +- [ ] UPDATE on partition key values + - remove previous row from chunk, insert into new one? +- [ ] properly sqlite3_vtab_nochange / sqlite3_value_nochange handling diff --git a/sqlite-vec.c b/sqlite-vec.c index cb4901d..caa992e 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -105,6 +105,10 @@ typedef size_t usize; #define SQLITE_INDEX_CONSTRAINT_LIMIT 73 #endif +#ifndef SQLITE_INDEX_CONSTRAINT_OFFSET +#define SQLITE_INDEX_CONSTRAINT_OFFSET 74 +#endif + #define countof(x) (sizeof(x) / sizeof((x)[0])) #define min(a, b) (((a) <= (b)) ? (a) : (b)) @@ -1930,6 +1934,83 @@ int vec0_parse_table_option(const char *source, int source_length, } return SQLITE_ERROR; } +/** + * @brief Parse an argv[i] entry of a vec0 virtual table definition, and see if + * it's a PARTITION KEY definition. + * + * @param source: argv[i] source string + * @param source_length: length of the source string + * @param out_column_name: If it is a partition key, the output column name. 
Same lifetime + * as source, points to specific char * + * @param out_column_name_length: Length of out_column_name in bytes + * @param out_column_type: SQLITE_TEXT or SQLITE_INTEGER. + * @return int: SQLITE_EMPTY if not a PK, SQLITE_OK if it is. + */ +int vec0_parse_partition_key_definition(const char *source, int source_length, + char **out_column_name, + int *out_column_name_length, + int *out_column_type) { + struct Vec0Scanner scanner; + struct Vec0Token token; + char *column_name; + int column_name_length; + int column_type; + vec0_scanner_init(&scanner, source, source_length); + + // Check first token is identifier, will be the column name + int rc = vec0_scanner_next(&scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME && + token.token_type != TOKEN_TYPE_IDENTIFIER) { + return SQLITE_EMPTY; + } + + column_name = token.start; + column_name_length = token.end - token.start; + + // Check the next token matches "text" or "integer", as column type + rc = vec0_scanner_next(&scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME && + token.token_type != TOKEN_TYPE_IDENTIFIER) { + return SQLITE_EMPTY; + } + if (sqlite3_strnicmp(token.start, "text", token.end - token.start) == 0) { + column_type = SQLITE_TEXT; + } else if (sqlite3_strnicmp(token.start, "int", token.end - token.start) == + 0 || + sqlite3_strnicmp(token.start, "integer", + token.end - token.start) == 0) { + column_type = SQLITE_INTEGER; + } else { + return SQLITE_EMPTY; + } + + // Check the next token is identifier and matches "partition" + rc = vec0_scanner_next(&scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME && + token.token_type != TOKEN_TYPE_IDENTIFIER) { + return SQLITE_EMPTY; + } + if (sqlite3_strnicmp(token.start, "partition", token.end - token.start) != 0) { + return SQLITE_EMPTY; + } + + // Check the next token is identifier and matches "key" + rc = vec0_scanner_next(&scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME && + token.token_type != TOKEN_TYPE_IDENTIFIER) { + return SQLITE_EMPTY; + } + if (sqlite3_strnicmp(token.start, "key", token.end - token.start) != 0) { + return SQLITE_EMPTY; + } + + *out_column_name = column_name; + *out_column_name_length = column_name_length; + *out_column_type = column_type; + + return SQLITE_OK; +} + /** * @brief Parse an argv[i] entry of a vec0 virtual table definition, and see if * it's a PRIMARY KEY definition. @@ -1942,7 +2023,7 @@ int vec0_parse_table_option(const char *source, int source_length, * @param out_column_type: SQLITE_TEXT or SQLITE_INTEGER. * @return int: SQLITE_EMPTY if not a PK, SQLITE_OK if it is. */ -int parse_primary_key_definition(const char *source, int source_length, +int vec0_parse_primary_key_definition(const char *source, int source_length, char **out_column_name, int *out_column_name_length, int *out_column_type) { @@ -2021,6 +2102,12 @@ struct VectorColumnDefinition { enum Vec0DistanceMetrics distance_metric; }; +struct Vec0PartitionColumnDefinition { + int type; + char * name; + int name_length; +}; + size_t vector_byte_size(enum VectorElementType element_type, size_t dimensions) { switch (element_type) { @@ -2048,7 +2135,7 @@ size_t vector_column_byte_size(struct VectorColumnDefinition column) { * @return int SQLITE_OK on success, SQLITE_EMPTY is it's not a vector column * definition, SQLITE_ERROR on error. 
*/ -int parse_vector_column(const char *source, int source_length, +int vec0_parse_vector_column(const char *source, int source_length, struct VectorColumnDefinition *outColumn) { // parses a vector column definition like so: // "abc float[123]", "abc_123 bit[1234]", eetc. @@ -3128,7 +3215,7 @@ static sqlite3_module vec_npy_eachModule = { #pragma region vec0 virtual table #define VEC0_COLUMN_ID 0 -#define VEC0_COLUMN_VECTORN_START 1 +#define VEC0_COLUMN_USERN_START 1 #define VEC0_COLUMN_OFFSET_DISTANCE 1 #define VEC0_COLUMN_OFFSET_K 2 @@ -3178,9 +3265,20 @@ static sqlite3_module vec_npy_eachModule = { typedef struct vec0_vtab vec0_vtab; -#define VEC0_MAX_VECTOR_COLUMNS 16 +#define VEC0_MAX_VECTOR_COLUMNS 16 +#define VEC0_MAX_PARTITION_COLUMNS 4 #define SQLITE_VEC_VEC0_MAX_DIMENSIONS 8192 +typedef enum { + // vector column, ie "contents_embedding float[1024]" + SQLITE_VEC0_USER_COLUMN_KIND_VECTOR = 1, + + // partition key column, ie "user_id integer partition key" + SQLITE_VEC0_USER_COLUMN_KIND_PARTITION = 2, + + // TODO: metadata + metadata filters +} vec0_user_column_kind; + struct vec0_vtab { sqlite3_vtab base; @@ -3191,6 +3289,13 @@ struct vec0_vtab { // Will change the schema of the _rowids table, and insert/query logic. int pkIsText; + // number of defined vector columns. + int numVectorColumns; + + // number of defined PARTITION KEY columns. + int numPartitionColumns; + + // Name of the schema the table exists on. // Must be freed with sqlite3_free() char *schemaName; @@ -3207,6 +3312,13 @@ struct vec0_vtab { // Must be freed with sqlite3_free() char *shadowChunksName; + // contains enum vec0_user_column_kind values for up to + // numVectorColumns + numPartitionColumns entries + uint8_t user_column_kinds[VEC0_MAX_VECTOR_COLUMNS + VEC0_MAX_PARTITION_COLUMNS]; + + uint8_t user_column_idxs[VEC0_MAX_VECTOR_COLUMNS + VEC0_MAX_PARTITION_COLUMNS]; + + // Name of all the vector chunk shadow tables. // Ex '_vector_chunks00' // Only the first numVectorColumns entries will be available. @@ -3214,9 +3326,7 @@ struct vec0_vtab { char *shadowVectorChunksNames[VEC0_MAX_VECTOR_COLUMNS]; struct VectorColumnDefinition vector_columns[VEC0_MAX_VECTOR_COLUMNS]; - - // number of defined vector columns. - int numVectorColumns; + struct Vec0PartitionColumnDefinition paritition_columns[VEC0_MAX_PARTITION_COLUMNS]; int chunk_size; @@ -3321,6 +3431,10 @@ void vec0_free(vec0_vtab *p) { } } +int vec0_num_defined_user_columns(vec0_vtab *p) { + return p->numVectorColumns + p->numPartitionColumns; +} + /** * @brief Returns the index of the distance hidden column for the given vec0 * table. @@ -3329,7 +3443,7 @@ void vec0_free(vec0_vtab *p) { * @return int */ int vec0_column_distance_idx(vec0_vtab *p) { - return VEC0_COLUMN_VECTORN_START + (p->numVectorColumns - 1) + + return VEC0_COLUMN_USERN_START + (vec0_num_defined_user_columns(p) - 1) + VEC0_COLUMN_OFFSET_DISTANCE; } @@ -3340,7 +3454,7 @@ int vec0_column_distance_idx(vec0_vtab *p) { * @return int k column index */ int vec0_column_k_idx(vec0_vtab *p) { - return VEC0_COLUMN_VECTORN_START + (p->numVectorColumns - 1) + + return VEC0_COLUMN_USERN_START + (vec0_num_defined_user_columns(p) - 1) + VEC0_COLUMN_OFFSET_K; } @@ -3349,18 +3463,36 @@ int vec0_column_k_idx(vec0_vtab *p) { * 0 otherwise. 
*/ int vec0_column_idx_is_vector(vec0_vtab *pVtab, int column_idx) { - return column_idx >= VEC0_COLUMN_VECTORN_START && - column_idx <= - (VEC0_COLUMN_VECTORN_START + pVtab->numVectorColumns - 1); + return column_idx >= VEC0_COLUMN_USERN_START && + column_idx <= (VEC0_COLUMN_USERN_START + vec0_num_defined_user_columns(pVtab) - 1) && + pVtab->user_column_kinds[column_idx - VEC0_COLUMN_USERN_START] == SQLITE_VEC0_USER_COLUMN_KIND_VECTOR; } /** - * Returns the vector index of the given vector column index. + * Returns the vector index of the given user column index. * ONLY call if validated with vec0_column_idx_is_vector before */ int vec0_column_idx_to_vector_idx(vec0_vtab *pVtab, int column_idx) { UNUSED_PARAMETER(pVtab); - return column_idx - VEC0_COLUMN_VECTORN_START; + return pVtab->user_column_idxs[column_idx - VEC0_COLUMN_USERN_START]; +} +/** + * Returns 1 if the given column-based index is a "partition key" column, + * 0 otherwise. + */ +int vec0_column_idx_is_partition(vec0_vtab *pVtab, int column_idx) { + return column_idx >= VEC0_COLUMN_USERN_START && + column_idx <= (VEC0_COLUMN_USERN_START + vec0_num_defined_user_columns(pVtab) - 1) && + pVtab->user_column_kinds[column_idx - VEC0_COLUMN_USERN_START] == SQLITE_VEC0_USER_COLUMN_KIND_PARTITION; +} + +/** + * Returns the partition column index of the given user column index. + * ONLY call if validated with vec0_column_idx_is_vector before + */ +int vec0_column_idx_to_partition_idx(vec0_vtab *pVtab, int column_idx) { + UNUSED_PARAMETER(pVtab); + return pVtab->user_column_idxs[column_idx - VEC0_COLUMN_USERN_START]; } /** @@ -3593,13 +3725,74 @@ int vec0_get_vector_data(vec0_vtab *pVtab, i64 rowid, int vector_column_idx, return rc; } -int vec0_get_latest_chunk_rowid(vec0_vtab *p, i64 *chunk_rowid) { +/** + * @brief Retrieve the sqlite3_value of the i'th partition value for the given row. + * + * @param pVtab - the vec0_vtab in questions + * @param rowid - rowid of target row + * @param partition_idx - which partition column to retrieve + * @param outValue - output sqlite3_value + * @return int - SQLITE_OK on success, otherwise error code + */ +int vec0_get_partition_value_for_rowid(vec0_vtab *pVtab, i64 rowid, int partition_idx, sqlite3_value ** outValue) { + int rc; + i64 chunk_id; + i64 chunk_offset; + rc = vec0_get_chunk_position(pVtab, rowid, NULL, &chunk_id, &chunk_offset); + if(rc != SQLITE_OK) { + return rc; + } + sqlite3_stmt * stmt = NULL; + char * zSql = sqlite3_mprintf("SELECT partition%02d FROM " VEC0_SHADOW_CHUNKS_NAME " WHERE chunk_id = ?", partition_idx, pVtab->schemaName, pVtab->tableName); + if(!zSql) { + return SQLITE_NOMEM; + } + rc = sqlite3_prepare_v2(pVtab->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if(rc != SQLITE_OK) { + return rc; + } + sqlite3_bind_int64(stmt, 1, chunk_id); + rc = sqlite3_step(stmt); + if(rc != SQLITE_ROW) { + rc = SQLITE_ERROR; + goto done; + } + *outValue = sqlite3_value_dup(sqlite3_column_value(stmt, 0)); + if(!*outValue) { + rc = SQLITE_NOMEM; + goto done; + } + rc = SQLITE_OK; + + done: + sqlite3_finalize(stmt); + return rc; + +} + +int vec0_get_latest_chunk_rowid(vec0_vtab *p, i64 *chunk_rowid, sqlite3_value ** partitionKeyValues) { int rc; const char *zSql; // lazy initialize stmtLatestChunk when needed. 
May be cleared during xSync() if (!p->stmtLatestChunk) { - zSql = sqlite3_mprintf("SELECT max(rowid) FROM " VEC0_SHADOW_CHUNKS_NAME, + if(p->numPartitionColumns > 0) { + sqlite3_str * s = sqlite3_str_new(NULL); + sqlite3_str_appendf(s, "SELECT max(rowid) FROM " VEC0_SHADOW_CHUNKS_NAME " WHERE ", + p->schemaName, p->tableName); + + for(int i = 0; i < p->numPartitionColumns; i++) { + if(i != 0) { + sqlite3_str_appendall(s, " AND "); + } + sqlite3_str_appendf(s, " partition%02d = ? ", i); + } + zSql = sqlite3_str_finish(s); + }else { + zSql = sqlite3_mprintf("SELECT max(rowid) FROM " VEC0_SHADOW_CHUNKS_NAME, p->schemaName, p->tableName); + } + if (!zSql) { rc = SQLITE_NOMEM; goto cleanup; @@ -3614,6 +3807,10 @@ int vec0_get_latest_chunk_rowid(vec0_vtab *p, i64 *chunk_rowid) { } } + for(int i = 0; i < p->numPartitionColumns; i++) { + sqlite3_bind_value(p->stmtLatestChunk, i+1, (partitionKeyValues[i])); + } + rc = sqlite3_step(p->stmtLatestChunk); if (rc != SQLITE_ROW) { // IMP: V31559_15629 @@ -3621,6 +3818,10 @@ int vec0_get_latest_chunk_rowid(vec0_vtab *p, i64 *chunk_rowid) { rc = SQLITE_ERROR; goto cleanup; } + if(sqlite3_column_type(p->stmtLatestChunk, 0) == SQLITE_NULL){ + rc = SQLITE_EMPTY; + goto cleanup; + } *chunk_rowid = sqlite3_column_int64(p->stmtLatestChunk, 0); rc = sqlite3_step(p->stmtLatestChunk); if (rc != SQLITE_DONE) { @@ -3636,6 +3837,7 @@ int vec0_get_latest_chunk_rowid(vec0_vtab *p, i64 *chunk_rowid) { cleanup: if (p->stmtLatestChunk) { sqlite3_reset(p->stmtLatestChunk); + sqlite3_clear_bindings(p->stmtLatestChunk); } return rc; } @@ -3825,21 +4027,39 @@ int vec0_rowids_update_position(vec0_vtab *p, i64 rowid, i64 chunk_rowid, * rowid to insert new blank rows into _vector_chunksXX tables. * * @param p: vec0 table to add new chunk - * @param chunk_rowid: Putput pointer, if not NULL, then will be filled with the + * @param paritionKeyValues: Array of partition key valeus for the new chunk, if available + * @param chunk_rowid: Output pointer, if not NULL, then will be filled with the * new chunk rowid. * @return int SQLITE_OK on success, error code otherwise. 
*/ -int vec0_new_chunk(vec0_vtab *p, i64 *chunk_rowid) { +int vec0_new_chunk(vec0_vtab *p, sqlite3_value ** partitionKeyValues, i64 *chunk_rowid) { int rc; char *zSql; sqlite3_stmt *stmt; i64 rowid; // Step 1: Insert a new row in _chunks, capture that new rowid - zSql = sqlite3_mprintf("INSERT INTO " VEC0_SHADOW_CHUNKS_NAME + if(p->numPartitionColumns > 0) { + sqlite3_str * s = sqlite3_str_new(NULL); + sqlite3_str_appendf(s, "INSERT INTO " VEC0_SHADOW_CHUNKS_NAME, p->schemaName, p->tableName); + sqlite3_str_appendall(s, "(size, validity, rowids"); + for(int i = 0; i < p->numPartitionColumns; i++) { + sqlite3_str_appendf(s, ", partition%02d", i); + } + sqlite3_str_appendall(s, ") VALUES (?, ?, ?"); + for(int i = 0; i < p->numPartitionColumns; i++) { + sqlite3_str_appendall(s, ", ?"); + } + sqlite3_str_appendall(s, ")"); + + zSql = sqlite3_str_finish(s); + }else { + zSql = sqlite3_mprintf("INSERT INTO " VEC0_SHADOW_CHUNKS_NAME "(size, validity, rowids) " "VALUES (?, ?, ?);", p->schemaName, p->tableName); + } + if (!zSql) { return SQLITE_NOMEM; } @@ -3860,6 +4080,10 @@ int vec0_new_chunk(vec0_vtab *p, i64 *chunk_rowid) { sqlite3_bind_zeroblob(stmt, 2, p->chunk_size / CHAR_BIT); // validity bitmap sqlite3_bind_zeroblob(stmt, 3, p->chunk_size * sizeof(i64)); // rowids + for(int i = 0; i < p->numPartitionColumns; i++) { + sqlite3_bind_value(stmt, 4 + i, partitionKeyValues[i]); + } + rc = sqlite3_step(stmt); int failed = rc != SQLITE_DONE; rowid = sqlite3_last_insert_rowid(p->db); @@ -3876,14 +4100,18 @@ int vec0_new_chunk(vec0_vtab *p, i64 *chunk_rowid) { // Step 2: Create new vector chunks for each vector column, with // that new chunk_rowid. - for (int i = 0; i < p->numVectorColumns; i++) { + for (int i = 0; i < vec0_num_defined_user_columns(p); i++) { + if(p->user_column_kinds[i] != SQLITE_VEC0_USER_COLUMN_KIND_VECTOR) { + continue; + } + int vector_column_idx = p->user_column_idxs[i]; i64 vectorsSize = - p->chunk_size * vector_column_byte_size(p->vector_columns[i]); + p->chunk_size * vector_column_byte_size(p->vector_columns[vector_column_idx]); zSql = sqlite3_mprintf("INSERT INTO " VEC0_SHADOW_VECTOR_N_NAME "(rowid, vectors)" "VALUES (?, ?)", - p->schemaName, p->tableName, i); + p->schemaName, p->tableName, vector_column_idx); if (!zSql) { return SQLITE_NOMEM; } @@ -3912,21 +4140,6 @@ int vec0_new_chunk(vec0_vtab *p, i64 *chunk_rowid) { return SQLITE_OK; } -// Possible query plans for xBestIndex on vec0 tables. -typedef enum { - // Full scan, every row is queried. - SQLITE_VEC0_QUERYPLAN_FULLSCAN, - - // A single row is queried by rowid/id - SQLITE_VEC0_QUERYPLAN_POINT, - - // A KNN-style query is made on a specific vector column. - // Requires - // 1) a MATCH/compatible distance contraint on a single vector column - // 2) either a 'LIMIT ?' or 'k=?' contraint - SQLITE_VEC0_QUERYPLAN_KNN, -} vec0_query_plan; - struct vec0_query_fullscan_data { sqlite3_stmt *rowids_stmt; i8 done; @@ -3979,6 +4192,14 @@ void vec0_query_point_data_clear(struct vec0_query_point_data *point_data) { } } +typedef enum { + // If any values are updated, please update the ARCHITECTURE.md docs accordingly! 
+ + VEC0_QUERY_PLAN_FULLSCAN = '1', + VEC0_QUERY_PLAN_POINT = '2', + VEC0_QUERY_PLAN_KNN = '3', +} vec0_query_plan; + typedef struct vec0_cursor vec0_cursor; struct vec0_cursor { sqlite3_vtab_cursor base; @@ -4025,6 +4246,8 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, // option int chunk_size = -1; int numVectorColumns = 0; + int numPartitionColumns = 0; + int user_column_idx = 0; // track if a "primary key" column is defined char *pkColumnName = NULL; @@ -4032,8 +4255,14 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, int pkColumnType = SQLITE_INTEGER; for (int i = 3; i < argc; i++) { - struct VectorColumnDefinition c; - rc = parse_vector_column(argv[i], strlen(argv[i]), &c); + struct VectorColumnDefinition vecColumn; + struct Vec0PartitionColumnDefinition partitionColumn; + char *cName = NULL; + int cNameLength; + int cType; + + // Scenario #1: Constructor argument is a vector column definition, ie `foo float[1024]` + rc = vec0_parse_vector_column(argv[i], strlen(argv[i]), &vecColumn); if (rc == SQLITE_ERROR) { *pzErr = sqlite3_mprintf( VEC_CONSTRUCTOR_ERROR "could not parse vector column '%s'", argv[i]); @@ -4041,30 +4270,59 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } if (rc == SQLITE_OK) { if (numVectorColumns >= VEC0_MAX_VECTOR_COLUMNS) { - sqlite3_free(c.name); + sqlite3_free(vecColumn.name); *pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR "Too many provided vector columns, maximum %d", VEC0_MAX_VECTOR_COLUMNS); goto error; } - if (c.dimensions > SQLITE_VEC_VEC0_MAX_DIMENSIONS) { - sqlite3_free(c.name); + if (vecColumn.dimensions > SQLITE_VEC_VEC0_MAX_DIMENSIONS) { + sqlite3_free(vecColumn.name); *pzErr = sqlite3_mprintf( VEC_CONSTRUCTOR_ERROR "Dimension on vector column too large, provided %lld, maximum %lld", - (i64)c.dimensions, SQLITE_VEC_VEC0_MAX_DIMENSIONS); + (i64)vecColumn.dimensions, SQLITE_VEC_VEC0_MAX_DIMENSIONS); goto error; } - memcpy(&pNew->vector_columns[numVectorColumns], &c, sizeof(c)); + pNew->user_column_kinds[user_column_idx] = SQLITE_VEC0_USER_COLUMN_KIND_VECTOR; + pNew->user_column_idxs[user_column_idx] = numVectorColumns; + memcpy(&pNew->vector_columns[numVectorColumns], &vecColumn, sizeof(vecColumn)); numVectorColumns++; + user_column_idx++; + continue; } - char *cName = NULL; - int cNameLength; - int cType; - rc = parse_primary_key_definition(argv[i], strlen(argv[i]), &cName, + // Scenario #2: Constructor argument is a partition key column definition, ie `user_id text partition key` + rc = vec0_parse_partition_key_definition(argv[i], strlen(argv[i]), &cName, + &cNameLength, &cType); + if (rc == SQLITE_OK) { + if (numPartitionColumns >= VEC0_MAX_PARTITION_COLUMNS) { + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + "More than %d partition key columns were provided", + VEC0_MAX_PARTITION_COLUMNS); + goto error; + } + partitionColumn.type = cType; + partitionColumn.name_length = cNameLength; + partitionColumn.name = sqlite3_mprintf("%.*s", cNameLength, cName); + if(!partitionColumn.name) { + rc = SQLITE_NOMEM; + goto error; + } + + pNew->user_column_kinds[user_column_idx] = SQLITE_VEC0_USER_COLUMN_KIND_PARTITION; + pNew->user_column_idxs[user_column_idx] = numPartitionColumns; + memcpy(&pNew->paritition_columns[numPartitionColumns], &partitionColumn, sizeof(partitionColumn)); + numPartitionColumns++; + user_column_idx++; + continue; + } + + // Scenario #3: Constructor argument is a primary key column definition, ie `article_id text primary key` 
+ rc = vec0_parse_primary_key_definition(argv[i], strlen(argv[i]), &cName, &cNameLength, &cType); if (rc == SQLITE_OK) { if (pkColumnName) { @@ -4081,6 +4339,8 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, continue; } + // Scenario #4: Constructor argument is a table-level option, ie `chunk_size` + char *key; char *value; int keyLength, valueLength; @@ -4121,6 +4381,8 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } continue; } + + // Scenario #5: Unknown constructor argument *pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR "Could not parse '%s'", argv[i]); goto error; @@ -4144,10 +4406,24 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } else { sqlite3_str_appendall(createStr, "rowid, "); } - for (int i = 0; i < numVectorColumns; i++) { - sqlite3_str_appendf(createStr, "\"%.*w\", ", - pNew->vector_columns[i].name_length, - pNew->vector_columns[i].name); + for (int i = 0; i < numVectorColumns + numPartitionColumns; i++) { + switch(pNew->user_column_kinds[i]) { + case SQLITE_VEC0_USER_COLUMN_KIND_VECTOR: { + int vector_idx = pNew->user_column_idxs[i]; + sqlite3_str_appendf(createStr, "\"%.*w\", ", + pNew->vector_columns[vector_idx].name_length, + pNew->vector_columns[vector_idx].name); + break; + } + case SQLITE_VEC0_USER_COLUMN_KIND_PARTITION: { + int partition_idx = pNew->user_column_idxs[i]; + sqlite3_str_appendf(createStr, "\"%.*w\", ", + pNew->paritition_columns[partition_idx].name_length, + pNew->paritition_columns[partition_idx].name); + break; + } + } + } sqlite3_str_appendall(createStr, " distance hidden, k hidden) "); if (pkColumnName) { @@ -4188,9 +4464,7 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, goto error; } pNew->numVectorColumns = numVectorColumns; - if (!pNew->numVectorColumns) { - goto error; - } + pNew->numPartitionColumns = numPartitionColumns; for (int i = 0; i < pNew->numVectorColumns; i++) { pNew->shadowVectorChunksNames[i] = sqlite3_mprintf("%s_vector_chunks%02d", tableName, i); @@ -4206,12 +4480,24 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, int rc; // create the _chunks shadow table - char *zCreateShadowChunks; - zCreateShadowChunks = sqlite3_mprintf(VEC0_SHADOW_CHUNKS_CREATE, + char *zCreateShadowChunks = NULL; + if(pNew->numPartitionColumns) { + sqlite3_str * s = sqlite3_str_new(NULL); + sqlite3_str_appendf(s, "CREATE TABLE " VEC0_SHADOW_CHUNKS_NAME "(", pNew->schemaName, pNew->tableName); + sqlite3_str_appendall(s, "chunk_id INTEGER PRIMARY KEY AUTOINCREMENT," "size INTEGER NOT NULL,"); + sqlite3_str_appendall(s, "sequence_id integer,"); + for(int i = 0; i < pNew->numPartitionColumns;i++) { + sqlite3_str_appendf(s, "partition%02d,", i); + } + sqlite3_str_appendall(s, "validity BLOB NOT NULL, rowids BLOB NOT NULL);"); + zCreateShadowChunks = sqlite3_str_finish(s); + }else { + zCreateShadowChunks = sqlite3_mprintf(VEC0_SHADOW_CHUNKS_CREATE, pNew->schemaName, pNew->tableName); - if (!zCreateShadowChunks) { - goto error; } + if (!zCreateShadowChunks) { + goto error; + } rc = sqlite3_prepare_v2(db, zCreateShadowChunks, -1, &stmt, 0); sqlite3_free((void *)zCreateShadowChunks); if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { @@ -4265,12 +4551,6 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } sqlite3_finalize(stmt); } - - rc = vec0_new_chunk(pNew, NULL); - if (rc != SQLITE_OK) { - *pzErr = sqlite3_mprintf("Could not create create 
an initial chunk"); - goto error; - } } *ppVtab = (sqlite3_vtab *)pNew; @@ -4372,9 +4652,30 @@ static int vec0Close(sqlite3_vtab_cursor *cur) { return SQLITE_OK; } -#define VEC0_QUERY_PLAN_FULLSCAN "fullscan" -#define VEC0_QUERY_PLAN_POINT "point" -#define VEC0_QUERY_PLAN_KNN "knn" +// All the different type of "values" provided to argv/argc in vec0Filter. +// These enums denote the use and purpose of all of them. +typedef enum { + // If any values are updated, please update the ARCHITECTURE.md docs accordingly! + + VEC0_IDXSTR_KIND_KNN_MATCH = '{', + VEC0_IDXSTR_KIND_KNN_K = '}', + VEC0_IDXSTR_KIND_KNN_ROWID_IN = '[', + VEC0_IDXSTR_KIND_KNN_PARTITON_CONSTRAINT = ']', + VEC0_IDXSTR_KIND_POINT_ID = '!', +} vec0_idxstr_kind; + +// The different SQLITE_INDEX_CONSTRAINT values that vec0 partition key columns +// support, but as characters that fit nicely in idxstr. +typedef enum { + // If any values are updated, please update the ARCHITECTURE.md docs accordingly! + + VEC0_PARTITION_OPERATOR_EQ = 'a', + VEC0_PARTITION_OPERATOR_GT = 'b', + VEC0_PARTITION_OPERATOR_LE = 'c', + VEC0_PARTITION_OPERATOR_LT = 'd', + VEC0_PARTITION_OPERATOR_GE = 'e', + VEC0_PARTITION_OPERATOR_NE = 'f', +} vec0_partition_operator; static int vec0BestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pIdxInfo) { vec0_vtab *p = (vec0_vtab *)pVTab; @@ -4420,6 +4721,10 @@ static int vec0BestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pIdxInfo) { int iColumn = pIdxInfo->aConstraint[i].iColumn; int op = pIdxInfo->aConstraint[i].op; + + if (op == SQLITE_INDEX_CONSTRAINT_LIMIT) { + iLimitTerm = i; + } if (op == SQLITE_INDEX_CONSTRAINT_MATCH && vec0_column_idx_is_vector(p, iColumn)) { if (iMatchTerm > -1) { @@ -4430,9 +4735,6 @@ static int vec0BestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pIdxInfo) { iMatchTerm = i; iMatchVectorTerm = vec0_column_idx_to_vector_idx(p, iColumn); } - if (op == SQLITE_INDEX_CONSTRAINT_LIMIT) { - iLimitTerm = i; - } if (op == SQLITE_INDEX_CONSTRAINT_EQ && iColumn == VEC0_COLUMN_ID) { if (vtabIn) { if (iRowidInTerm != -1) { @@ -4450,88 +4752,167 @@ static int vec0BestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pIdxInfo) { iKTerm = i; } } + + sqlite3_str *idxStr = sqlite3_str_new(NULL); + int rc; + if (iMatchTerm >= 0) { if (iLimitTerm < 0 && iKTerm < 0) { vtab_set_error( pVTab, "A LIMIT or 'k = ?' constraint is required on vec0 knn queries."); - return SQLITE_ERROR; + rc = SQLITE_ERROR; + goto done; } if (iLimitTerm >= 0 && iKTerm >= 0) { vtab_set_error(pVTab, "Only LIMIT or 'k =?' 
can be provided, not both"); - return SQLITE_ERROR; + rc = SQLITE_ERROR; + goto done; } if (pIdxInfo->nOrderBy) { if (pIdxInfo->nOrderBy > 1) { vtab_set_error(pVTab, "Only a single 'ORDER BY distance' clause is " "allowed on vec0 KNN queries"); - return SQLITE_ERROR; + rc = SQLITE_ERROR; + goto done; } if (pIdxInfo->aOrderBy[0].iColumn != vec0_column_distance_idx(p)) { vtab_set_error(pVTab, "Only a single 'ORDER BY distance' clause is allowed on " "vec0 KNN queries, not on other columns"); - return SQLITE_ERROR; + rc = SQLITE_ERROR; + goto done; } if (pIdxInfo->aOrderBy[0].desc) { vtab_set_error( pVTab, "Only ascending in ORDER BY distance clause is supported, " "DESC is not supported yet."); - return SQLITE_ERROR; + rc = SQLITE_ERROR; + goto done; } } - pIdxInfo->aConstraintUsage[iMatchTerm].argvIndex = 1; + sqlite3_str_appendchar(idxStr, 1, VEC0_QUERY_PLAN_KNN); + + int argvIndex = 1; + pIdxInfo->aConstraintUsage[iMatchTerm].argvIndex = argvIndex++; pIdxInfo->aConstraintUsage[iMatchTerm].omit = 1; + sqlite3_str_appendchar(idxStr, 1, VEC0_IDXSTR_KIND_KNN_MATCH); + sqlite3_str_appendchar(idxStr, 3, '_'); if (iLimitTerm >= 0) { - pIdxInfo->aConstraintUsage[iLimitTerm].argvIndex = 2; + pIdxInfo->aConstraintUsage[iLimitTerm].argvIndex = argvIndex++; pIdxInfo->aConstraintUsage[iLimitTerm].omit = 1; } else { - pIdxInfo->aConstraintUsage[iKTerm].argvIndex = 2; + pIdxInfo->aConstraintUsage[iKTerm].argvIndex = argvIndex++; pIdxInfo->aConstraintUsage[iKTerm].omit = 1; } - - sqlite3_str *idxStr = sqlite3_str_new(NULL); - sqlite3_str_appendall(idxStr, "knn:"); -#define VEC0_IDX_KNN_ROWID_IN 'I' + sqlite3_str_appendchar(idxStr, 1, VEC0_IDXSTR_KIND_KNN_K); + sqlite3_str_appendchar(idxStr, 3, '_'); #if COMPILER_SUPPORTS_VTAB_IN if (iRowidInTerm >= 0) { // already validated as >= SQLite 3.38 bc iRowidInTerm is only >= 0 when // vtabIn == 1 sqlite3_vtab_in(pIdxInfo, iRowidInTerm, 1); - sqlite3_str_appendchar(idxStr, VEC0_IDX_KNN_ROWID_IN, 1); - pIdxInfo->aConstraintUsage[iRowidInTerm].argvIndex = 3; + pIdxInfo->aConstraintUsage[iRowidInTerm].argvIndex = argvIndex++; pIdxInfo->aConstraintUsage[iRowidInTerm].omit = 1; + sqlite3_str_appendchar(idxStr, 1, VEC0_IDXSTR_KIND_KNN_ROWID_IN); + sqlite3_str_appendchar(idxStr, 3, '_'); } #endif - pIdxInfo->idxNum = iMatchVectorTerm; - pIdxInfo->idxStr = sqlite3_str_finish(idxStr); - if (!pIdxInfo->idxStr) { - return SQLITE_NOMEM; + for (int i = 0; i < pIdxInfo->nConstraint; i++) { + if (!pIdxInfo->aConstraint[i].usable) + continue; + + int iColumn = pIdxInfo->aConstraint[i].iColumn; + int op = pIdxInfo->aConstraint[i].op; + if(op == SQLITE_INDEX_CONSTRAINT_LIMIT || op == SQLITE_INDEX_CONSTRAINT_OFFSET) { + continue; + } + if(!vec0_column_idx_is_partition(p, iColumn)) { + continue; + } + + int partition_idx = vec0_column_idx_to_partition_idx(p, iColumn); + char value = 0; + + switch(op) { + case SQLITE_INDEX_CONSTRAINT_EQ: { + value = VEC0_PARTITION_OPERATOR_EQ; + break; + } + case SQLITE_INDEX_CONSTRAINT_GT: { + value = VEC0_PARTITION_OPERATOR_GT; + break; + } + case SQLITE_INDEX_CONSTRAINT_LE: { + value = VEC0_PARTITION_OPERATOR_LE; + break; + } + case SQLITE_INDEX_CONSTRAINT_LT: { + value = VEC0_PARTITION_OPERATOR_LT; + break; + } + case SQLITE_INDEX_CONSTRAINT_GE: { + value = VEC0_PARTITION_OPERATOR_GE; + break; + } + case SQLITE_INDEX_CONSTRAINT_NE: { + value = VEC0_PARTITION_OPERATOR_NE; + break; + } + } + + if(value) { + pIdxInfo->aConstraintUsage[i].argvIndex = argvIndex++; + pIdxInfo->aConstraintUsage[i].omit = 1; + sqlite3_str_appendchar(idxStr, 1, 
VEC0_IDXSTR_KIND_KNN_PARTITON_CONSTRAINT); + sqlite3_str_appendchar(idxStr, 1, 'A' + partition_idx); + sqlite3_str_appendchar(idxStr, 1, value); + sqlite3_str_appendchar(idxStr, 1, '_'); + } + } - pIdxInfo->needToFreeIdxStr = 1; + + + + pIdxInfo->idxNum = iMatchVectorTerm; pIdxInfo->estimatedCost = 30.0; pIdxInfo->estimatedRows = 10; } else if (iRowidTerm >= 0) { + sqlite3_str_appendchar(idxStr, 1, VEC0_QUERY_PLAN_POINT); pIdxInfo->aConstraintUsage[iRowidTerm].argvIndex = 1; pIdxInfo->aConstraintUsage[iRowidTerm].omit = 1; + sqlite3_str_appendchar(idxStr, 1, VEC0_IDXSTR_KIND_POINT_ID); + sqlite3_str_appendchar(idxStr, 3, '_'); pIdxInfo->idxNum = pIdxInfo->colUsed; - pIdxInfo->idxStr = VEC0_QUERY_PLAN_POINT; - pIdxInfo->needToFreeIdxStr = 0; pIdxInfo->estimatedCost = 10.0; pIdxInfo->estimatedRows = 1; } else { - pIdxInfo->idxStr = VEC0_QUERY_PLAN_FULLSCAN; + sqlite3_str_appendchar(idxStr, 1, VEC0_QUERY_PLAN_FULLSCAN); pIdxInfo->estimatedCost = 3000000.0; pIdxInfo->estimatedRows = 100000; } + pIdxInfo->idxStr = sqlite3_str_finish(idxStr); + idxStr = NULL; + if (!pIdxInfo->idxStr) { + rc = SQLITE_NOMEM; + goto done; + } + pIdxInfo->needToFreeIdxStr = 1; - return SQLITE_OK; + + rc = SQLITE_OK; + + done: + if(idxStr) { + sqlite3_free(sqlite3_str_finish(idxStr)); + } + return rc; } // forward declaration bc vec0Filter uses it @@ -4665,6 +5046,103 @@ int min_idx(const f32 *distances, i32 n, u8 *candidates, i32 *out, i32 k, return SQLITE_OK; } +/** + * @brief Create an "iterator" (sqlite3_stmt) of chunks with the given constraints + * + * Any VEC0_IDXSTR_KIND_KNN_PARTITON_CONSTRAINT values in idxStr/argv will be applied + * as WHERE constraints in the underlying stmt SQL, and any consumer of the stmt + * can freely step through the stmt with all constraints satisfied. + * + * @param p - vec0_vtab + * @param idxStr - the xBestIndex/xFilter idxstr containing VEC0_IDXSTR values + * @param argc - number of argv values from xFilter + * @param argv - array of sqlite3_value from xFilter + * @param outStmt - output sqlite3_stmt of chunks with all filters applied + * @return int SQLITE_OK on success, error code otherwise + */ +int vec0_chunks_iter(vec0_vtab * p, const char * idxStr, int argc, sqlite3_value ** argv, sqlite3_stmt** outStmt) { + // always null terminated, enforced by SQLite + int idxStrLength = strlen(idxStr); + // "1" refers to the initial vec0_query_plan char, 4 is the number of chars per "element" + int numValueEntries = (idxStrLength-1) / 4; + + int rc; + sqlite3_str * s = sqlite3_str_new(NULL); + sqlite3_str_appendf(s, "select chunk_id, validity, rowids " + " from " VEC0_SHADOW_CHUNKS_NAME, + p->schemaName, p->tableName); + + int appendedWhere = 0; + for(int i = 0; i < numValueEntries; i++) { + int idx = 1 + (i * 4); + char kind = idxStr[idx + 0]; + if(kind != VEC0_IDXSTR_KIND_KNN_PARTITON_CONSTRAINT) { + continue; + } + + int partition_idx = idxStr[idx + 1] - 'A'; + int operator = idxStr[idx + 2]; + // idxStr[idx + 3] is just a '_' placeholder + + if(!appendedWhere) { + sqlite3_str_appendall(s, " WHERE "); + appendedWhere = 1; + }else { + sqlite3_str_appendall(s, " AND "); + } + switch(operator) { + case VEC0_PARTITION_OPERATOR_EQ: + sqlite3_str_appendf(s, " partition%02d = ? ", partition_idx); + break; + case VEC0_PARTITION_OPERATOR_GT: + sqlite3_str_appendf(s, " partition%02d > ? ", partition_idx); + break; + case VEC0_PARTITION_OPERATOR_LE: + sqlite3_str_appendf(s, " partition%02d <= ? ", partition_idx); + break; + case VEC0_PARTITION_OPERATOR_LT: + sqlite3_str_appendf(s, " partition%02d < ? 
", partition_idx); + break; + case VEC0_PARTITION_OPERATOR_GE: + sqlite3_str_appendf(s, " partition%02d >= ? ", partition_idx); + break; + case VEC0_PARTITION_OPERATOR_NE: + sqlite3_str_appendf(s, " partition%02d != ? ", partition_idx); + break; + default: { + char * zSql = sqlite3_str_finish(s); + sqlite3_free(zSql); + return SQLITE_ERROR; + } + + } + + } + + char *zSql = sqlite3_str_finish(s); + if (!zSql) { + return SQLITE_NOMEM; + } + + rc = sqlite3_prepare_v2(p->db, zSql, -1, outStmt, NULL); + sqlite3_free(zSql); + if(rc != SQLITE_OK) { + return rc; + } + + int n = 1; + for(int i = 0; i < numValueEntries; i++) { + int idx = 1 + (i * 4); + char kind = idxStr[idx + 0]; + if(kind != VEC0_IDXSTR_KIND_KNN_PARTITON_CONSTRAINT) { + continue; + } + sqlite3_bind_value(*outStmt, n++, argv[i]); + } + + return rc; +} + int vec0Filter_knn_chunks_iter(vec0_vtab *p, sqlite3_stmt *stmtChunks, struct VectorColumnDefinition *vector_column, int vectorColumnIdx, struct Array *arrayRowidsIn, @@ -4960,8 +5438,7 @@ int vec0Filter_knn_chunks_iter(vec0_vtab *p, sqlite3_stmt *stmtChunks, int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, const char *idxStr, int argc, sqlite3_value **argv) { - UNUSED_PARAMETER(idxStr); - assert(argc >= 2); + assert(argc == (strlen(idxStr)-1) / 4); int rc; struct vec0_query_knn_data *knn_data; @@ -4982,7 +5459,25 @@ int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, } memset(knn_data, 0, sizeof(*knn_data)); - rc = vector_from_value(argv[0], &queryVector, &dimensions, &elementType, + int query_idx =-1; + int k_idx = -1; + int rowid_in_idx = -1; + for(int i = 0; i < argc; i++) { + if(idxStr[1 + (i*4)] == VEC0_IDXSTR_KIND_KNN_MATCH) { + query_idx = i; + } + if(idxStr[1 + (i*4)] == VEC0_IDXSTR_KIND_KNN_K) { + k_idx = i; + } + if(idxStr[1 + (i*4)] == VEC0_IDXSTR_KIND_KNN_ROWID_IN) { + rowid_in_idx = i; + } + } + assert(query_idx >= 0); + assert(k_idx >= 0); + + // make sure the query vector matches the vector column (type dimensions etc.) + rc = vector_from_value(argv[query_idx], &queryVector, &dimensions, &elementType, &queryVectorCleanup, &pzError); if (rc != SQLITE_OK) { @@ -5014,7 +5509,7 @@ int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, goto cleanup; } - i64 k = sqlite3_value_int64(argv[1]); + i64 k = sqlite3_value_int64(argv[k_idx]); if (k < 0) { vtab_set_error( &p->base, "k value in knn queries must be greater than or equal to 0."); @@ -5034,7 +5529,7 @@ int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, if (k == 0) { knn_data->k = 0; pCur->knn_data = knn_data; - pCur->query_plan = SQLITE_VEC0_QUERYPLAN_KNN; + pCur->query_plan = VEC0_QUERY_PLAN_KNN; rc = SQLITE_OK; goto cleanup; } @@ -5043,7 +5538,7 @@ int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, // Array of all the rowids that appear in any `rowid in (...)` constraint. // NULL if none were provided, which means a "full" scan. 
#if COMPILER_SUPPORTS_VTAB_IN - if (argc > 2) { + if (rowid_in_idx >= 0) { sqlite3_value *item; int rc; arrayRowidsIn = sqlite3_malloc(sizeof(*arrayRowidsIn)); @@ -5057,8 +5552,8 @@ int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, if (rc != SQLITE_OK) { goto cleanup; } - for (rc = sqlite3_vtab_in_first(argv[2], &item); rc == SQLITE_OK && item; - rc = sqlite3_vtab_in_next(argv[2], &item)) { + for (rc = sqlite3_vtab_in_first(argv[rowid_in_idx], &item); rc == SQLITE_OK && item; + rc = sqlite3_vtab_in_next(argv[rowid_in_idx], &item)) { i64 rowid; if (p->pkIsText) { rc = vec0_rowid_from_id(p, item, &rowid); @@ -5082,16 +5577,7 @@ int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, } #endif - char *zSql; - zSql = sqlite3_mprintf("select chunk_id, validity, rowids " - " from " VEC0_SHADOW_CHUNKS_NAME, - p->schemaName, p->tableName); - if (!zSql) { - rc = SQLITE_NOMEM; - goto cleanup; - } - rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmtChunks, NULL); - sqlite3_free(zSql); + rc = vec0_chunks_iter(p, idxStr, argc, argv, &stmtChunks); if (rc != SQLITE_OK) { // IMP: V06942_23781 vtab_set_error(&p->base, "Error preparing stmtChunk: %s", @@ -5116,7 +5602,7 @@ int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, knn_data->k_used = k_used; pCur->knn_data = knn_data; - pCur->query_plan = SQLITE_VEC0_QUERYPLAN_KNN; + pCur->query_plan = VEC0_QUERY_PLAN_KNN; rc = SQLITE_OK; cleanup: @@ -5164,7 +5650,7 @@ int vec0Filter_fullscan(vec0_vtab *p, vec0_cursor *pCur) { } fullscan_data->done = rc == SQLITE_DONE; - pCur->query_plan = SQLITE_VEC0_QUERYPLAN_FULLSCAN; + pCur->query_plan = VEC0_QUERY_PLAN_FULLSCAN; pCur->fullscan_data = fullscan_data; return SQLITE_OK; @@ -5213,14 +5699,14 @@ int vec0Filter_point(vec0_cursor *pCur, vec0_vtab *p, int argc, point_data->rowid = rowid; point_data->done = 0; pCur->point_data = point_data; - pCur->query_plan = SQLITE_VEC0_QUERYPLAN_POINT; + pCur->query_plan = VEC0_QUERY_PLAN_POINT; return SQLITE_OK; eof: point_data->rowid = rowid; point_data->done = 1; pCur->point_data = point_data; - pCur->query_plan = SQLITE_VEC0_QUERYPLAN_POINT; + pCur->query_plan = VEC0_QUERY_PLAN_POINT; return SQLITE_OK; error: @@ -5234,30 +5720,45 @@ static int vec0Filter(sqlite3_vtab_cursor *pVtabCursor, int idxNum, vec0_vtab *p = (vec0_vtab *)pVtabCursor->pVtab; vec0_cursor *pCur = (vec0_cursor *)pVtabCursor; vec0_cursor_clear(pCur); - if (strcmp(idxStr, VEC0_QUERY_PLAN_FULLSCAN) == 0) { - return vec0Filter_fullscan(p, pCur); - } else if (strncmp(idxStr, "knn:", 4) == 0) { - return vec0Filter_knn(pCur, p, idxNum, idxStr, argc, argv); - } else if (strcmp(idxStr, VEC0_QUERY_PLAN_POINT) == 0) { - return vec0Filter_point(pCur, p, argc, argv); - } else { - vtab_set_error(pVtabCursor->pVtab, "unknown idxStr '%s'", idxStr); + + int idxStrLength = strlen(idxStr); + if(idxStrLength <= 0) { + return SQLITE_ERROR; + } + if((idxStrLength-1) % 4 != 0) { return SQLITE_ERROR; } + int numValueEntries = (idxStrLength-1) / 4; + if(numValueEntries != argc) { + return SQLITE_ERROR; + } + + char query_plan = idxStr[0]; + switch(query_plan) { + case VEC0_QUERY_PLAN_FULLSCAN: + return vec0Filter_fullscan(p, pCur); + case VEC0_QUERY_PLAN_KNN: + return vec0Filter_knn(pCur, p, idxNum, idxStr, argc, argv); + case VEC0_QUERY_PLAN_POINT: + return vec0Filter_point(pCur, p, argc, argv); + default: + vtab_set_error(pVtabCursor->pVtab, "unknown idxStr '%s'", idxStr); + return SQLITE_ERROR; + } } static int vec0Rowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid) { vec0_cursor *pCur = (vec0_cursor 
*)cur; switch (pCur->query_plan) { - case SQLITE_VEC0_QUERYPLAN_FULLSCAN: { + case VEC0_QUERY_PLAN_FULLSCAN: { *pRowid = sqlite3_column_int64(pCur->fullscan_data->rowids_stmt, 0); return SQLITE_OK; } - case SQLITE_VEC0_QUERYPLAN_POINT: { + case VEC0_QUERY_PLAN_POINT: { *pRowid = pCur->point_data->rowid; return SQLITE_OK; } - case SQLITE_VEC0_QUERYPLAN_KNN: { + case VEC0_QUERY_PLAN_KNN: { vtab_set_error(cur->pVtab, "Internal sqlite-vec error: expected point query plan in " "vec0Rowid, found %d", @@ -5271,7 +5772,7 @@ static int vec0Rowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid) { static int vec0Next(sqlite3_vtab_cursor *cur) { vec0_cursor *pCur = (vec0_cursor *)cur; switch (pCur->query_plan) { - case SQLITE_VEC0_QUERYPLAN_FULLSCAN: { + case VEC0_QUERY_PLAN_FULLSCAN: { if (!pCur->fullscan_data) { return SQLITE_ERROR; } @@ -5285,7 +5786,7 @@ static int vec0Next(sqlite3_vtab_cursor *cur) { } return SQLITE_ERROR; } - case SQLITE_VEC0_QUERYPLAN_KNN: { + case VEC0_QUERY_PLAN_KNN: { if (!pCur->knn_data) { return SQLITE_ERROR; } @@ -5293,7 +5794,7 @@ static int vec0Next(sqlite3_vtab_cursor *cur) { pCur->knn_data->current_idx++; return SQLITE_OK; } - case SQLITE_VEC0_QUERYPLAN_POINT: { + case VEC0_QUERY_PLAN_POINT: { if (!pCur->point_data) { return SQLITE_ERROR; } @@ -5307,13 +5808,13 @@ static int vec0Next(sqlite3_vtab_cursor *cur) { static int vec0Eof(sqlite3_vtab_cursor *cur) { vec0_cursor *pCur = (vec0_cursor *)cur; switch (pCur->query_plan) { - case SQLITE_VEC0_QUERYPLAN_FULLSCAN: { + case VEC0_QUERY_PLAN_FULLSCAN: { if (!pCur->fullscan_data) { return 1; } return pCur->fullscan_data->done; } - case SQLITE_VEC0_QUERYPLAN_KNN: { + case VEC0_QUERY_PLAN_KNN: { if (!pCur->knn_data) { return 1; } @@ -5321,7 +5822,7 @@ static int vec0Eof(sqlite3_vtab_cursor *cur) { // (pCur->knn_data->distances[pCur->knn_data->current_idx] == FLT_MAX); return (pCur->knn_data->current_idx >= pCur->knn_data->k_used); } - case SQLITE_VEC0_QUERYPLAN_POINT: { + case VEC0_QUERY_PLAN_POINT: { if (!pCur->point_data) { return 1; } @@ -5341,7 +5842,8 @@ static int vec0Column_fullscan(vec0_vtab *pVtab, vec0_cursor *pCur, i64 rowid = sqlite3_column_int64(pCur->fullscan_data->rowids_stmt, 0); if (i == VEC0_COLUMN_ID) { return vec0_result_id(pVtab, context, rowid); - } else if (vec0_column_idx_is_vector(pVtab, i)) { + } + else if (vec0_column_idx_is_vector(pVtab, i)) { void *v; int sz; int vector_idx = vec0_column_idx_to_vector_idx(pVtab, i); @@ -5353,11 +5855,20 @@ static int vec0Column_fullscan(vec0_vtab *pVtab, vec0_cursor *pCur, sqlite3_result_subtype(context, pVtab->vector_columns[vector_idx].element_type); - } else if (i == vec0_column_distance_idx(pVtab)) { - sqlite3_result_null(context); - } else { + } + else if (i == vec0_column_distance_idx(pVtab)) { sqlite3_result_null(context); } + else if(vec0_column_idx_is_partition(pVtab, i)) { + int partition_idx = vec0_column_idx_to_partition_idx(pVtab, i); + sqlite3_value * v; + int rc = vec0_get_partition_value_for_rowid(pVtab, rowid, partition_idx, &v); + if(rc == SQLITE_OK) { + sqlite3_result_value(context, v); + }else { + sqlite3_result_error_code(context, rc); + } + } return SQLITE_OK; } @@ -5371,11 +5882,11 @@ static int vec0Column_point(vec0_vtab *pVtab, vec0_cursor *pCur, if (i == VEC0_COLUMN_ID) { return vec0_result_id(pVtab, context, pCur->point_data->rowid); } - if (i == vec0_column_distance_idx(pVtab)) { + else if (i == vec0_column_distance_idx(pVtab)) { sqlite3_result_null(context); return SQLITE_OK; } - if (vec0_column_idx_is_vector(pVtab, i)) { + else if 
(vec0_column_idx_is_vector(pVtab, i)) { if (sqlite3_vtab_nochange(context)) { sqlite3_result_null(context); return SQLITE_OK; } @@ -5389,6 +5900,20 @@ static int vec0Column_point(vec0_vtab *pVtab, vec0_cursor *pCur, pVtab->vector_columns[vector_idx].element_type); return SQLITE_OK; } + else if(vec0_column_idx_is_partition(pVtab, i)) { + if(sqlite3_vtab_nochange(context)) { + return SQLITE_OK; + } + int partition_idx = vec0_column_idx_to_partition_idx(pVtab, i); + i64 rowid = pCur->point_data->rowid; + sqlite3_value * v; + int rc = vec0_get_partition_value_for_rowid(pVtab, rowid, partition_idx, &v); + if(rc == SQLITE_OK) { + sqlite3_result_value(context, v); + }else { + sqlite3_result_error_code(context, rc); + } + } return SQLITE_OK; } @@ -5404,12 +5929,12 @@ static int vec0Column_knn(vec0_vtab *pVtab, vec0_cursor *pCur, i64 rowid = pCur->knn_data->rowids[pCur->knn_data->current_idx]; return vec0_result_id(pVtab, context, rowid); } - if (i == vec0_column_distance_idx(pVtab)) { + else if (i == vec0_column_distance_idx(pVtab)) { sqlite3_result_double( context, pCur->knn_data->distances[pCur->knn_data->current_idx]); return SQLITE_OK; } - if (vec0_column_idx_is_vector(pVtab, i)) { + else if (vec0_column_idx_is_vector(pVtab, i)) { void *out; int sz; int vector_idx = vec0_column_idx_to_vector_idx(pVtab, i); @@ -5424,6 +5949,17 @@ static int vec0Column_knn(vec0_vtab *pVtab, vec0_cursor *pCur, pVtab->vector_columns[vector_idx].element_type); return SQLITE_OK; } + else if(vec0_column_idx_is_partition(pVtab, i)) { + int partition_idx = vec0_column_idx_to_partition_idx(pVtab, i); + i64 rowid = pCur->knn_data->rowids[pCur->knn_data->current_idx]; + sqlite3_value * v; + int rc = vec0_get_partition_value_for_rowid(pVtab, rowid, partition_idx, &v); + if(rc == SQLITE_OK) { + sqlite3_result_value(context, v); + }else { + sqlite3_result_error_code(context, rc); + } + } return SQLITE_OK; } @@ -5433,13 +5969,13 @@ static int vec0Column(sqlite3_vtab_cursor *cur, sqlite3_context *context, vec0_cursor *pCur = (vec0_cursor *)cur; vec0_vtab *pVtab = (vec0_vtab *)cur->pVtab; switch (pCur->query_plan) { - case SQLITE_VEC0_QUERYPLAN_FULLSCAN: { + case VEC0_QUERY_PLAN_FULLSCAN: { return vec0Column_fullscan(pVtab, pCur, context, i); } - case SQLITE_VEC0_QUERYPLAN_KNN: { + case VEC0_QUERY_PLAN_KNN: { return vec0Column_knn(pVtab, pCur, context, i); } - case SQLITE_VEC0_QUERYPLAN_POINT: { + case VEC0_QUERY_PLAN_POINT: { return vec0Column_point(pVtab, pCur, context, i); } } @@ -5516,6 +6052,8 @@ int vec0Update_InsertRowidStep(vec0_vtab *p, sqlite3_value *idValue, * no more space in previous chunks. * * @param p: virtual table + * @param partitionKeyValues: array of partition key column values, to constrain + * against any partition key columns. * @param chunk_rowid: Output rowid of the chunk in the _chunks virtual table * that has the available space. 
* @param chunk_offset: Output the index of the available space insert the @@ -5527,7 +6065,9 @@ int vec0Update_InsertRowidStep(vec0_vtab *p, sqlite3_value *idValue, * @return int SQLITE_OK on success, error code on failure */ int vec0Update_InsertNextAvailableStep( - vec0_vtab *p, i64 *chunk_rowid, i64 *chunk_offset, + vec0_vtab *p, + sqlite3_value ** partitionKeyValues, + i64 *chunk_rowid, i64 *chunk_offset, sqlite3_blob **blobChunksValidity, const unsigned char **bufferChunksValidity) { @@ -5535,7 +6075,10 @@ int vec0Update_InsertNextAvailableStep( i64 validitySize; *chunk_offset = -1; - rc = vec0_get_latest_chunk_rowid(p, chunk_rowid); + rc = vec0_get_latest_chunk_rowid(p, chunk_rowid, partitionKeyValues); + if(rc == SQLITE_EMPTY) { + goto done; + } if (rc != SQLITE_OK) { goto cleanup; } @@ -5598,7 +6141,7 @@ int vec0Update_InsertNextAvailableStep( done: // latest chunk was full, so need to create a new one if (*chunk_offset == -1) { - rc = vec0_new_chunk(p, chunk_rowid); + rc = vec0_new_chunk(p, partitionKeyValues, chunk_rowid); if (rc != SQLITE_OK) { // IMP: V08441_25279 vtab_set_error(&p->base, @@ -5852,6 +6395,8 @@ int vec0Update_Insert(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv, // Array to hold cleanup functions for vectorDatas[] vector_cleanup cleanups[VEC0_MAX_VECTOR_COLUMNS]; + sqlite3_value * partitionKeyValues[VEC0_MAX_PARTITION_COLUMNS]; + // Rowid of the chunk in the _chunks shadow table that the row will be a part // of. i64 chunk_rowid; @@ -5865,26 +6410,54 @@ int vec0Update_Insert(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv, const unsigned char *bufferChunksValidity = NULL; int numReadVectors = 0; + // Read all provided partition key values into partitionKeyValues + for (int i = 0; i < vec0_num_defined_user_columns(p); i++) { + if(p->user_column_kinds[i] != SQLITE_VEC0_USER_COLUMN_KIND_PARTITION) { + continue; + } + int partition_key_idx = p->user_column_idxs[i]; + partitionKeyValues[partition_key_idx] = argv[2+VEC0_COLUMN_USERN_START + i]; + + int new_value_type = sqlite3_value_type(partitionKeyValues[partition_key_idx]); + if((new_value_type != SQLITE_NULL) && (new_value_type != p->paritition_columns[partition_key_idx].type)) { + // IMP: V11454_28292 + vtab_set_error( + pVTab, + "Parition key type mismatch: The partition key column %.*s has type %s, but %s was provided.", + p->paritition_columns[partition_key_idx].name_length, + p->paritition_columns[partition_key_idx].name, + type_name(p->paritition_columns[partition_key_idx].type), + type_name(new_value_type) + ); + rc = SQLITE_ERROR; + goto cleanup; + } + } + // read all the inserted vectors into vectorDatas, validate their lengths. 
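+  // An illustrative mapping (assumed here, mirroring the table defined in test.sql in this patch): for `vec0(v_aaa float[1], partk_xxx int partition key, v_bbb float[2], partk_yyy text partition key)`, user_column_kinds is [VECTOR, PARTITION, VECTOR, PARTITION] and user_column_idxs is [0, 0, 1, 1]. + // The loop above therefore routes partk_yyy into partitionKeyValues[1], while the loop below routes v_bbb into vectorDatas[1] and cleanups[1].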
- for (int i = 0; i < p->numVectorColumns; i++) { - sqlite3_value *valueVector = argv[2 + VEC0_COLUMN_VECTORN_START + i]; + for (int i = 0; i < vec0_num_defined_user_columns(p); i++) { + if(p->user_column_kinds[i] != SQLITE_VEC0_USER_COLUMN_KIND_VECTOR) { + continue; + } + int vector_column_idx = p->user_column_idxs[i]; + sqlite3_value *valueVector = argv[2 + VEC0_COLUMN_USERN_START + i]; size_t dimensions; char *pzError; enum VectorElementType elementType; - rc = vector_from_value(valueVector, &vectorDatas[i], &dimensions, - &elementType, &cleanups[i], &pzError); + rc = vector_from_value(valueVector, &vectorDatas[vector_column_idx], &dimensions, + &elementType, &cleanups[vector_column_idx], &pzError); if (rc != SQLITE_OK) { // IMP: V06519_23358 vtab_set_error( pVTab, "Inserted vector for the \"%.*s\" column is invalid: %z", - p->vector_columns[i].name_length, p->vector_columns[i].name, pzError); + p->vector_columns[vector_column_idx].name_length, p->vector_columns[vector_column_idx].name, pzError); rc = SQLITE_ERROR; goto cleanup; } numReadVectors++; - if (elementType != p->vector_columns[i].element_type) { + if (elementType != p->vector_columns[vector_column_idx].element_type) { // IMP: V08221_25059 vtab_set_error( pVTab, @@ -5897,14 +6470,14 @@ int vec0Update_Insert(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv, goto cleanup; } - if (dimensions != p->vector_columns[i].dimensions) { + if (dimensions != p->vector_columns[vector_column_idx].dimensions) { // IMP: V01145_17984 vtab_set_error( pVTab, "Dimension mismatch for inserted vector for the \"%.*s\" column. " "Expected %d dimensions but received %d.", - p->vector_columns[i].name_length, p->vector_columns[i].name, - p->vector_columns[i].dimensions, dimensions); + p->vector_columns[vector_column_idx].name_length, p->vector_columns[vector_column_idx].name, + p->vector_columns[vector_column_idx].dimensions, dimensions); rc = SQLITE_ERROR; goto cleanup; } @@ -5935,7 +6508,8 @@ int vec0Update_Insert(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv, // Step #2: Find the next "available" position in the _chunks table for this // row. - rc = vec0Update_InsertNextAvailableStep(p, &chunk_rowid, &chunk_offset, + rc = vec0Update_InsertNextAvailableStep(p, partitionKeyValues, + &chunk_rowid, &chunk_offset, &blobChunksValidity, &bufferChunksValidity); if (rc != SQLITE_OK) { @@ -6212,16 +6786,33 @@ int vec0Update_Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv) { rowid = sqlite3_value_int64(argv[0]); } - // 1. get chunk_id and chunk_offset from _rowids + // 1) get chunk_id and chunk_offset from _rowids rc = vec0_get_chunk_position(p, rowid, NULL, &chunk_id, &chunk_offset); if (rc != SQLITE_OK) { return rc; } - // 2) iterate over all new vectors, update the vectors + // 2) update any partition key values + for (int i = 0; i < vec0_num_defined_user_columns(p); i++) { + if(p->user_column_kinds[i] != SQLITE_VEC0_USER_COLUMN_KIND_PARTITION) { + continue; + } + int partition_key_idx = p->user_column_idxs[i]; + sqlite3_value * value = argv[2+VEC0_COLUMN_USERN_START + i]; + if(sqlite3_value_nochange(value)) { + continue; + } + vtab_set_error(pVTab, "UPDATE on partition key columns are not supported yet. 
"); + return SQLITE_ERROR; + } - for (int i = 0; i < p->numVectorColumns; i++) { - sqlite3_value *valueVector = argv[2 + VEC0_COLUMN_VECTORN_START + i]; + // 3) iterate over all new vectors, update the vectors + for (int i = 0; i < vec0_num_defined_user_columns(p); i++) { + if(p->user_column_kinds[i] != SQLITE_VEC0_USER_COLUMN_KIND_VECTOR) { + continue; + } + int vector_idx = p->user_column_idxs[i]; + sqlite3_value *valueVector = argv[2 + VEC0_COLUMN_USERN_START + i]; // in vec0Column, we check sqlite3_vtab_nochange() on vector columns. // If the vector column isn't being changed, we return NULL; // That's not great, that means vector columns can never be NULLABLE @@ -6236,7 +6827,7 @@ int vec0Update_Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv) { continue; } - rc = vec0Update_UpdateVectorColumn(p, chunk_id, chunk_offset, i, + rc = vec0Update_UpdateVectorColumn(p, chunk_id, chunk_offset, vector_idx, valueVector); if (rc != SQLITE_OK) { return SQLITE_ERROR; diff --git a/test.sql b/test.sql new file mode 100644 index 0000000..7434207 --- /dev/null +++ b/test.sql @@ -0,0 +1,49 @@ +.load dist/vec0 +.echo on +.bail on + +.mode qbox + +create virtual table v using vec0(a float[1]); +select count(*) from v_chunks; +insert into v(a) values ('[1.11]'); +select * from v; +drop table v; + +create virtual table v using vec0( + + v_aaa float[1], + partk_xxx int partition key, + v_bbb float[2], + partk_yyy text partition key, + chunk_size=32 +); + + +insert into v(rowid, v_aaa, partk_xxx, v_bbb, partk_yyy) values + (1, '[.1]', 999, '[.11, .11]', 'alex'), + (2, '[.2]', 999, '[.22, .22]', 'alex'), + (3, '[.3]', 999, '[.33, .33]', 'brian'); + + +select rowid, vec_to_json(v_aaa), partk_xxx, vec_to_json(v_bbb), partk_yyy from v; + +select * from v; +select * from v where rowid = 2; +update v +set v_aaa = '[.222]', + v_bbb = '[.222, .222]' +where rowid = 2; + +select rowid, vec_to_json(v_aaa), partk_xxx, vec_to_json(v_bbb), partk_yyy from v; + +select chunk_id, size, sequence_id, partition00, partition01, (validity), length(rowids) from v_chunks; + +--explain query plan +select *, distance +from v +where v_aaa match '[.5]' + and partk_xxx = 999 + and partk_yyy = 'alex' + --and partk_xxx != 20 + and k = 5; diff --git a/tests/.python-version b/tests/.python-version new file mode 100644 index 0000000..e4fba21 --- /dev/null +++ b/tests/.python-version @@ -0,0 +1 @@ +3.12 diff --git a/tests/__snapshots__/test-partition-keys.ambr b/tests/__snapshots__/test-partition-keys.ambr new file mode 100644 index 0000000..a9dca88 --- /dev/null +++ b/tests/__snapshots__/test-partition-keys.ambr @@ -0,0 +1,245 @@ +# serializer version: 1 +# name: test_constructor_limit[max 4 partition keys] + dict({ + 'error': 'OperationalError', + 'message': 'vec0 constructor error: More than 4 partition key columns were provided', + }) +# --- +# name: test_normal[1 row] + dict({ + 'v_chunks': OrderedDict({ + 'sql': 'select * from v_chunks', + 'rows': list([ + OrderedDict({ + 'chunk_id': 1, + 'size': 8, + 'sequence_id': None, + 'partition00': 100, + 'validity': b'\x01', + 'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + 'v_rowids': OrderedDict({ + 'sql': 'select * from v_rowids', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'id': None, + 'chunk_id': 1, + 'chunk_offset': 0, + }), + ]), + 
}), + 'v_vector_chunks00': OrderedDict({ + 'sql': 'select * from v_vector_chunks00', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'vectors': b'\x11"3D\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + }) +# --- +# name: test_normal[2 rows, same parition] + dict({ + 'v_chunks': OrderedDict({ + 'sql': 'select * from v_chunks', + 'rows': list([ + OrderedDict({ + 'chunk_id': 1, + 'size': 8, + 'sequence_id': None, + 'partition00': 100, + 'validity': b'\x03', + 'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + 'v_rowids': OrderedDict({ + 'sql': 'select * from v_rowids', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'id': None, + 'chunk_id': 1, + 'chunk_offset': 0, + }), + OrderedDict({ + 'rowid': 2, + 'id': None, + 'chunk_id': 1, + 'chunk_offset': 1, + }), + ]), + }), + 'v_vector_chunks00': OrderedDict({ + 'sql': 'select * from v_vector_chunks00', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'vectors': b'\x11"3DDUfw\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + }) +# --- +# name: test_normal[3 rows, 2 partitions] + dict({ + 'v_chunks': OrderedDict({ + 'sql': 'select * from v_chunks', + 'rows': list([ + OrderedDict({ + 'chunk_id': 1, + 'size': 8, + 'sequence_id': None, + 'partition00': 100, + 'validity': b'\x03', + 'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + OrderedDict({ + 'chunk_id': 2, + 'size': 8, + 'sequence_id': None, + 'partition00': 200, + 'validity': b'\x01', + 'rowids': b'\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + 'v_rowids': OrderedDict({ + 'sql': 'select * from v_rowids', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'id': None, + 'chunk_id': 1, + 'chunk_offset': 0, + }), + OrderedDict({ + 'rowid': 2, + 'id': None, + 'chunk_id': 1, + 'chunk_offset': 1, + }), + OrderedDict({ + 'rowid': 3, + 'id': None, + 'chunk_id': 2, + 'chunk_offset': 0, + }), + ]), + }), + 'v_vector_chunks00': OrderedDict({ + 'sql': 'select * from v_vector_chunks00', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'vectors': b'\x11"3DDUfw\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + OrderedDict({ + 'rowid': 2, + 'vectors': b'\x88\x99\xaa\xbb\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + }) +# --- +# name: test_types[1. raises type error] + dict({ + 'error': 'OperationalError', + 'message': 'Parition key type mismatch: The partition key column p1 has type INTEGER, but TEXT was provided.', + }) +# --- +# name: test_types[2. 
empty DB] + dict({ + 'v_chunks': OrderedDict({ + 'sql': 'select * from v_chunks', + 'rows': list([ + ]), + }), + 'v_rowids': OrderedDict({ + 'sql': 'select * from v_rowids', + 'rows': list([ + ]), + }), + 'v_vector_chunks00': OrderedDict({ + 'sql': 'select * from v_vector_chunks00', + 'rows': list([ + ]), + }), + }) +# --- +# name: test_types[3. allow nulls] + OrderedDict({ + 'sql': 'insert into v(p1, a) values(?, ?)', + 'rows': list([ + ]), + }) +# --- +# name: test_types[4. show NULL partition key] + dict({ + 'v_chunks': OrderedDict({ + 'sql': 'select * from v_chunks', + 'rows': list([ + OrderedDict({ + 'chunk_id': 1, + 'size': 8, + 'sequence_id': None, + 'partition00': None, + 'validity': b'\x01', + 'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + 'v_rowids': OrderedDict({ + 'sql': 'select * from v_rowids', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'id': None, + 'chunk_id': 1, + 'chunk_offset': 0, + }), + ]), + }), + 'v_vector_chunks00': OrderedDict({ + 'sql': 'select * from v_vector_chunks00', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'vectors': b'\x11"3D\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + }) +# --- +# name: test_updates[1. Initial dataset] + OrderedDict({ + 'sql': 'select * from v', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'p': 'a', + 'a': b'\x11\x11\x11\x11', + }), + OrderedDict({ + 'rowid': 2, + 'p': 'a', + 'a': b'""""', + }), + OrderedDict({ + 'rowid': 3, + 'p': 'a', + 'a': b'3333', + }), + ]), + }) +# --- +# name: test_updates[2. update #1] + dict({ + 'error': 'OperationalError', + 'message': 'UPDATE on partition key columns are not supported yet. 
', + }) +# --- diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..9549d37 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,12 @@ +import pytest +import sqlite3 + + +@pytest.fixture() +def db(): + db = sqlite3.connect(":memory:") + db.row_factory = sqlite3.Row + db.enable_load_extension(True) + db.load_extension("dist/vec0") + db.enable_load_extension(False) + return db diff --git a/tests/pyproject.toml b/tests/pyproject.toml new file mode 100644 index 0000000..15c42c9 --- /dev/null +++ b/tests/pyproject.toml @@ -0,0 +1,9 @@ +[project] +name = "tests" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "pytest", "numpy", "syrupy" +] diff --git a/tests/test-correctness.py b/tests/skip.test-correctness.py similarity index 100% rename from tests/test-correctness.py rename to tests/skip.test-correctness.py diff --git a/tests/test-loadable.py b/tests/test-loadable.py index b1976cb..30171fe 100644 --- a/tests/test-loadable.py +++ b/tests/test-loadable.py @@ -81,7 +81,7 @@ def connect(ext, path=":memory:", extra_entrypoint=None): db = connect(EXT_PATH) -def explain_query_plan(sql): +def explain_query_plan(sql, db=db): return db.execute("explain query plan " + sql).fetchone()["detail"] @@ -1497,6 +1497,13 @@ def test_vec0_text_pk(): ] if SUPPORTS_VTAB_IN: + assert re.match( + ("SCAN (TABLE )?t VIRTUAL TABLE INDEX 0:3{___}___\[___"), + explain_query_plan( + "select t_id, distance from t where aaa match '' and k = 3 and t_id in ('t_2', 't_3')", + db=db, + ), + ) assert execute_all( db, "select t_id, distance from t where aaa match ? and k = 3 and t_id in ('t_2', 't_3')", @@ -1939,20 +1946,6 @@ def test_vec0_create_errors(): db.execute("create virtual table t1 using vec0(a float[1])") db.set_authorizer(None) - db.set_authorizer(authorizer_deny_on(sqlite3.SQLITE_INSERT, "t1_chunks")) - with _raises( - "Could not create create an initial chunk", - ): - db.execute("create virtual table t1 using vec0(a float[1])") - db.set_authorizer(None) - - db.set_authorizer(authorizer_deny_on(sqlite3.SQLITE_INSERT, "t1_vector_chunks00")) - with _raises( - "Could not create create an initial chunk", - ): - db.execute("create virtual table t1 using vec0(a float[1])") - db.set_authorizer(None) - # EVIDENCE-OF: V21406_05476 vec0 init raises error on 'latest chunk' init error db.execute("BEGIN") db.set_authorizer(authorizer_deny_on(sqlite3.SQLITE_READ, "t1_chunks", "")) @@ -2231,32 +2224,34 @@ def test_smoke(): }, ] chunk = db.execute("select * from vec_xyz_chunks").fetchone() - assert chunk["chunk_id"] == 1 - assert chunk["validity"] == bytearray(int(1024 / 8)) - assert chunk["rowids"] == bytearray(int(1024 * 8)) - vchunk = db.execute("select * from vec_xyz_vector_chunks00").fetchone() - assert vchunk["rowid"] == 1 - assert vchunk["vectors"] == bytearray(int(1024 * 4 * 2)) + # as of TODO, no initial row is inside the chunks table + assert chunk is None + # assert chunk["chunk_id"] == 1 + # assert chunk["validity"] == bytearray(int(1024 / 8)) + # assert chunk["rowids"] == bytearray(int(1024 * 8)) + # vchunk = db.execute("select * from vec_xyz_vector_chunks00").fetchone() + # assert vchunk["rowid"] == 1 + # assert vchunk["vectors"] == bytearray(int(1024 * 4 * 2)) assert re.match( - "SCAN (TABLE )?vec_xyz VIRTUAL TABLE INDEX 0:knn:", + "SCAN (TABLE )?vec_xyz VIRTUAL TABLE INDEX 0:3{___}___", explain_query_plan( "select * from vec_xyz where a match X'' and k = 10 order by distance" ), ) if SUPPORTS_VTAB_LIMIT: 
assert re.match( - "SCAN (TABLE )?vec_xyz VIRTUAL TABLE INDEX 0:knn:", + "SCAN (TABLE )?vec_xyz VIRTUAL TABLE INDEX 0:3{___}___", explain_query_plan( "select * from vec_xyz where a match X'' order by distance limit 10" ), ) assert re.match( - "SCAN (TABLE )?vec_xyz VIRTUAL TABLE INDEX 0:fullscan", + "SCAN (TABLE )?vec_xyz VIRTUAL TABLE INDEX 0:1", explain_query_plan("select * from vec_xyz"), ) assert re.match( - "SCAN (TABLE )?vec_xyz VIRTUAL TABLE INDEX 3:point", + "SCAN (TABLE )?vec_xyz VIRTUAL TABLE INDEX 3:2", explain_query_plan("select * from vec_xyz where rowid = 4"), ) diff --git a/tests/test-partition-keys.py b/tests/test-partition-keys.py new file mode 100644 index 0000000..41c7671 --- /dev/null +++ b/tests/test-partition-keys.py @@ -0,0 +1,115 @@ +import sqlite3 +from collections import OrderedDict + + +def test_constructor_limit(db, snapshot): + assert exec( + db, + """ + create virtual table v using vec0( + p1 int partition key, + p2 int partition key, + p3 int partition key, + p4 int partition key, + p5 int partition key, + v float[1] + ) + """, + ) == snapshot(name="max 4 partition keys") + + +def test_normal(db, snapshot): + db.execute( + "create virtual table v using vec0(p1 int partition key, a float[1], chunk_size=8)" + ) + + db.execute("insert into v(rowid, p1, a) values (1, 100, X'11223344')") + assert vec0_shadow_table_contents(db, "v") == snapshot(name="1 row") + db.execute("insert into v(rowid, p1, a) values (2, 100, X'44556677')") + assert vec0_shadow_table_contents(db, "v") == snapshot(name="2 rows, same parition") + db.execute("insert into v(rowid, p1, a) values (3, 200, X'8899aabb')") + assert vec0_shadow_table_contents(db, "v") == snapshot(name="3 rows, 2 partitions") + + +def test_types(db, snapshot): + db.execute( + "create virtual table v using vec0(p1 int partition key, a float[1], chunk_size=8)" + ) + + # EVIDENCE-OF: V11454_28292 + assert exec( + db, "insert into v(p1, a) values(?, ?)", ["not int", b"\x11\x22\x33\x44"] + ) == snapshot(name="1. raises type error") + + assert vec0_shadow_table_contents(db, "v") == snapshot(name="2. empty DB") + + # but allow NULLs + assert exec( + db, "insert into v(p1, a) values(?, ?)", [None, b"\x11\x22\x33\x44"] + ) == snapshot(name="3. allow nulls") + + assert vec0_shadow_table_contents(db, "v") == snapshot( + name="4. show NULL partition key" + ) + + +def test_updates(db, snapshot): + db.execute( + "create virtual table v using vec0(p text partition key, a float[1], chunk_size=8)" + ) + + db.execute( + "insert into v(rowid, p, a) values (?, ?, ?)", [1, "a", b"\x11\x11\x11\x11"] + ) + db.execute( + "insert into v(rowid, p, a) values (?, ?, ?)", [2, "a", b"\x22\x22\x22\x22"] + ) + db.execute( + "insert into v(rowid, p, a) values (?, ?, ?)", [3, "a", b"\x33\x33\x33\x33"] + ) + + assert exec(db, "select * from v") == snapshot(name="1. Initial dataset") + assert exec(db, "update v set p = ? where rowid = ?", ["new", 1]) == snapshot( + name="2. 
update #1" + ) + + +class Row: + def __init__(self): + pass + + def __repr__(self) -> str: + return repr() + + +def exec(db, sql, parameters=[]): + try: + rows = db.execute(sql, parameters).fetchall() + except (sqlite3.OperationalError, sqlite3.DatabaseError) as e: + return { + "error": e.__class__.__name__, + "message": str(e), + } + a = [] + for row in rows: + o = OrderedDict() + for k in row.keys(): + o[k] = row[k] + a.append(o) + result = OrderedDict() + result["sql"] = sql + result["rows"] = a + return result + + +def vec0_shadow_table_contents(db, v): + shadow_tables = [ + row[0] + for row in db.execute( + "select name from sqlite_master where name like ? order by 1", [f"{v}_%"] + ).fetchall() + ] + o = {} + for shadow_table in shadow_tables: + o[shadow_table] = exec(db, f"select * from {shadow_table}") + return o diff --git a/tests/test-unit.c b/tests/test-unit.c new file mode 100644 index 0000000..d9a1211 --- /dev/null +++ b/tests/test-unit.c @@ -0,0 +1,54 @@ +#include "../sqlite-vec.h" +#include +#include +#include + +#define countof(x) (sizeof(x) / sizeof((x)[0])) + +void test_vec0_parse_partition_key_definition() { + printf("Starting %s...\n", __func__); + typedef struct { + char * test; + int expected_rc; + const char *expected_column_name; + int expected_column_type; + } TestCase; + + TestCase suite[] = { + {"user_id integer partition key", SQLITE_OK, "user_id", SQLITE_INTEGER}, + {"USER_id int partition key", SQLITE_OK, "USER_id", SQLITE_INTEGER}, + {"category text partition key", SQLITE_OK, "category", SQLITE_TEXT}, + + {"", SQLITE_EMPTY, "", 0}, + {"document_id text primary key", SQLITE_EMPTY, "", 0}, + {"document_id text partition keyy", SQLITE_EMPTY, "", 0}, + }; + for(int i = 0; i < countof(suite); i++) { + char * out_column_name; + int out_column_name_length; + int out_column_type; + int rc; + rc = vec0_parse_partition_key_definition( + suite[i].test, + strlen(suite[i].test), + &out_column_name, + &out_column_name_length, + &out_column_type + ); + printf("2\n"); + assert(rc == suite[i].expected_rc); + + if(rc == SQLITE_OK) { + assert(out_column_name_length == strlen(suite[i].expected_column_name)); + assert(strncmp(out_column_name, suite[i].expected_column_name, out_column_name_length) == 0); + assert(out_column_type == suite[i].expected_column_type); + } + + printf("✅ %s\n", suite[i].test); + } +} + +int main() { + printf("Starting unit tests...\n"); + test_vec0_parse_partition_key_definition(); +} diff --git a/tests/uv.lock b/tests/uv.lock new file mode 100644 index 0000000..7e3ee4b --- /dev/null +++ b/tests/uv.lock @@ -0,0 +1,120 @@ +version = 1 +requires-python = ">=3.12" + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, +] + +[[package]] +name = "iniconfig" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d7/4b/cbd8e699e64a6f16ca3a8220661b5f83792b3017d0f79807cb8708d33913/iniconfig-2.0.0.tar.gz", hash = 
"sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3", size = 4646 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374", size = 5892 }, +] + +[[package]] +name = "numpy" +version = "2.1.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/25/ca/1166b75c21abd1da445b97bf1fa2f14f423c6cfb4fc7c4ef31dccf9f6a94/numpy-2.1.3.tar.gz", hash = "sha256:aa08e04e08aaf974d4458def539dece0d28146d866a39da5639596f4921fd761", size = 20166090 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/f0/385eb9970309643cbca4fc6eebc8bb16e560de129c91258dfaa18498da8b/numpy-2.1.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f55ba01150f52b1027829b50d70ef1dafd9821ea82905b63936668403c3b471e", size = 20849658 }, + { url = "https://files.pythonhosted.org/packages/54/4a/765b4607f0fecbb239638d610d04ec0a0ded9b4951c56dc68cef79026abf/numpy-2.1.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:13138eadd4f4da03074851a698ffa7e405f41a0845a6b1ad135b81596e4e9958", size = 13492258 }, + { url = "https://files.pythonhosted.org/packages/bd/a7/2332679479c70b68dccbf4a8eb9c9b5ee383164b161bee9284ac141fbd33/numpy-2.1.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:a6b46587b14b888e95e4a24d7b13ae91fa22386c199ee7b418f449032b2fa3b8", size = 5090249 }, + { url = "https://files.pythonhosted.org/packages/c1/67/4aa00316b3b981a822c7a239d3a8135be2a6945d1fd11d0efb25d361711a/numpy-2.1.3-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:0fa14563cc46422e99daef53d725d0c326e99e468a9320a240affffe87852564", size = 6621704 }, + { url = "https://files.pythonhosted.org/packages/5e/da/1a429ae58b3b6c364eeec93bf044c532f2ff7b48a52e41050896cf15d5b1/numpy-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8637dcd2caa676e475503d1f8fdb327bc495554e10838019651b76d17b98e512", size = 13606089 }, + { url = "https://files.pythonhosted.org/packages/9e/3e/3757f304c704f2f0294a6b8340fcf2be244038be07da4cccf390fa678a9f/numpy-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2312b2aa89e1f43ecea6da6ea9a810d06aae08321609d8dc0d0eda6d946a541b", size = 16043185 }, + { url = "https://files.pythonhosted.org/packages/43/97/75329c28fea3113d00c8d2daf9bc5828d58d78ed661d8e05e234f86f0f6d/numpy-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a38c19106902bb19351b83802531fea19dee18e5b37b36454f27f11ff956f7fc", size = 16410751 }, + { url = "https://files.pythonhosted.org/packages/ad/7a/442965e98b34e0ae9da319f075b387bcb9a1e0658276cc63adb8c9686f7b/numpy-2.1.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:02135ade8b8a84011cbb67dc44e07c58f28575cf9ecf8ab304e51c05528c19f0", size = 14082705 }, + { url = "https://files.pythonhosted.org/packages/ac/b6/26108cf2cfa5c7e03fb969b595c93131eab4a399762b51ce9ebec2332e80/numpy-2.1.3-cp312-cp312-win32.whl", hash = "sha256:e6988e90fcf617da2b5c78902fe8e668361b43b4fe26dbf2d7b0f8034d4cafb9", size = 6239077 }, + { url = "https://files.pythonhosted.org/packages/a6/84/fa11dad3404b7634aaab50733581ce11e5350383311ea7a7010f464c0170/numpy-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:0d30c543f02e84e92c4b1f415b7c6b5326cbe45ee7882b6b77db7195fb971e3a", size = 12566858 }, + { url = 
"https://files.pythonhosted.org/packages/4d/0b/620591441457e25f3404c8057eb924d04f161244cb8a3680d529419aa86e/numpy-2.1.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:96fe52fcdb9345b7cd82ecd34547fca4321f7656d500eca497eb7ea5a926692f", size = 20836263 }, + { url = "https://files.pythonhosted.org/packages/45/e1/210b2d8b31ce9119145433e6ea78046e30771de3fe353f313b2778142f34/numpy-2.1.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f653490b33e9c3a4c1c01d41bc2aef08f9475af51146e4a7710c450cf9761598", size = 13507771 }, + { url = "https://files.pythonhosted.org/packages/55/44/aa9ee3caee02fa5a45f2c3b95cafe59c44e4b278fbbf895a93e88b308555/numpy-2.1.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:dc258a761a16daa791081d026f0ed4399b582712e6fc887a95af09df10c5ca57", size = 5075805 }, + { url = "https://files.pythonhosted.org/packages/78/d6/61de6e7e31915ba4d87bbe1ae859e83e6582ea14c6add07c8f7eefd8488f/numpy-2.1.3-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:016d0f6f5e77b0f0d45d77387ffa4bb89816b57c835580c3ce8e099ef830befe", size = 6608380 }, + { url = "https://files.pythonhosted.org/packages/3e/46/48bdf9b7241e317e6cf94276fe11ba673c06d1fdf115d8b4ebf616affd1a/numpy-2.1.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c181ba05ce8299c7aa3125c27b9c2167bca4a4445b7ce73d5febc411ca692e43", size = 13602451 }, + { url = "https://files.pythonhosted.org/packages/70/50/73f9a5aa0810cdccda9c1d20be3cbe4a4d6ea6bfd6931464a44c95eef731/numpy-2.1.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5641516794ca9e5f8a4d17bb45446998c6554704d888f86df9b200e66bdcce56", size = 16039822 }, + { url = "https://files.pythonhosted.org/packages/ad/cd/098bc1d5a5bc5307cfc65ee9369d0ca658ed88fbd7307b0d49fab6ca5fa5/numpy-2.1.3-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ea4dedd6e394a9c180b33c2c872b92f7ce0f8e7ad93e9585312b0c5a04777a4a", size = 16411822 }, + { url = "https://files.pythonhosted.org/packages/83/a2/7d4467a2a6d984549053b37945620209e702cf96a8bc658bc04bba13c9e2/numpy-2.1.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:b0df3635b9c8ef48bd3be5f862cf71b0a4716fa0e702155c45067c6b711ddcef", size = 14079598 }, + { url = "https://files.pythonhosted.org/packages/e9/6a/d64514dcecb2ee70bfdfad10c42b76cab657e7ee31944ff7a600f141d9e9/numpy-2.1.3-cp313-cp313-win32.whl", hash = "sha256:50ca6aba6e163363f132b5c101ba078b8cbd3fa92c7865fd7d4d62d9779ac29f", size = 6236021 }, + { url = "https://files.pythonhosted.org/packages/bb/f9/12297ed8d8301a401e7d8eb6b418d32547f1d700ed3c038d325a605421a4/numpy-2.1.3-cp313-cp313-win_amd64.whl", hash = "sha256:747641635d3d44bcb380d950679462fae44f54b131be347d5ec2bce47d3df9ed", size = 12560405 }, + { url = "https://files.pythonhosted.org/packages/a7/45/7f9244cd792e163b334e3a7f02dff1239d2890b6f37ebf9e82cbe17debc0/numpy-2.1.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:996bb9399059c5b82f76b53ff8bb686069c05acc94656bb259b1d63d04a9506f", size = 20859062 }, + { url = "https://files.pythonhosted.org/packages/b1/b4/a084218e7e92b506d634105b13e27a3a6645312b93e1c699cc9025adb0e1/numpy-2.1.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:45966d859916ad02b779706bb43b954281db43e185015df6eb3323120188f9e4", size = 13515839 }, + { url = "https://files.pythonhosted.org/packages/27/45/58ed3f88028dcf80e6ea580311dc3edefdd94248f5770deb980500ef85dd/numpy-2.1.3-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:baed7e8d7481bfe0874b566850cb0b85243e982388b7b23348c6db2ee2b2ae8e", size = 5116031 }, + { url = 
"https://files.pythonhosted.org/packages/37/a8/eb689432eb977d83229094b58b0f53249d2209742f7de529c49d61a124a0/numpy-2.1.3-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:a9f7f672a3388133335589cfca93ed468509cb7b93ba3105fce780d04a6576a0", size = 6629977 }, + { url = "https://files.pythonhosted.org/packages/42/a3/5355ad51ac73c23334c7caaed01adadfda49544f646fcbfbb4331deb267b/numpy-2.1.3-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7aac50327da5d208db2eec22eb11e491e3fe13d22653dce51b0f4109101b408", size = 13575951 }, + { url = "https://files.pythonhosted.org/packages/c4/70/ea9646d203104e647988cb7d7279f135257a6b7e3354ea6c56f8bafdb095/numpy-2.1.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4394bc0dbd074b7f9b52024832d16e019decebf86caf909d94f6b3f77a8ee3b6", size = 16022655 }, + { url = "https://files.pythonhosted.org/packages/14/ce/7fc0612903e91ff9d0b3f2eda4e18ef9904814afcae5b0f08edb7f637883/numpy-2.1.3-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:50d18c4358a0a8a53f12a8ba9d772ab2d460321e6a93d6064fc22443d189853f", size = 16399902 }, + { url = "https://files.pythonhosted.org/packages/ef/62/1d3204313357591c913c32132a28f09a26357e33ea3c4e2fe81269e0dca1/numpy-2.1.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:14e253bd43fc6b37af4921b10f6add6925878a42a0c5fe83daee390bca80bc17", size = 14067180 }, + { url = "https://files.pythonhosted.org/packages/24/d7/78a40ed1d80e23a774cb8a34ae8a9493ba1b4271dde96e56ccdbab1620ef/numpy-2.1.3-cp313-cp313t-win32.whl", hash = "sha256:08788d27a5fd867a663f6fc753fd7c3ad7e92747efc73c53bca2f19f8bc06f48", size = 6291907 }, + { url = "https://files.pythonhosted.org/packages/86/09/a5ab407bd7f5f5599e6a9261f964ace03a73e7c6928de906981c31c38082/numpy-2.1.3-cp313-cp313t-win_amd64.whl", hash = "sha256:2564fbdf2b99b3f815f2107c1bbc93e2de8ee655a69c261363a1172a79a257d4", size = 12644098 }, +] + +[[package]] +name = "packaging" +version = "24.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d0/63/68dbb6eb2de9cb10ee4c9c14a0148804425e13c4fb20d61cce69f53106da/packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f", size = 163950 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759", size = 65451 }, +] + +[[package]] +name = "pluggy" +version = "1.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 }, +] + +[[package]] +name = "pytest" +version = "8.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8b/6c/62bbd536103af674e227c41a8f3dcd022d591f6eed5facb5a0f31ee33bbc/pytest-8.3.3.tar.gz", hash = 
"sha256:70b98107bd648308a7952b06e6ca9a50bc660be218d53c257cc1fc94fda10181", size = 1442487 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6b/77/7440a06a8ead44c7757a64362dd22df5760f9b12dc5f11b6188cd2fc27a0/pytest-8.3.3-py3-none-any.whl", hash = "sha256:a6853c7375b2663155079443d2e45de913a911a11d669df02a50814944db57b2", size = 342341 }, +] + +[[package]] +name = "syrupy" +version = "4.7.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/67/81/f46d234fa4ca0edcdeed973bab9acd8f8ac186537cdc850e9e84a00f61a0/syrupy-4.7.2.tar.gz", hash = "sha256:ea45e099f242de1bb53018c238f408a5bb6c82007bc687aefcbeaa0e1c2e935a", size = 49320 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b9/75/57b629fdd256efc58fb045618d603ce0b0f5fcc477f34b758e34423efb99/syrupy-4.7.2-py3-none-any.whl", hash = "sha256:eae7ba6be5aed190237caa93be288e97ca1eec5ca58760e4818972a10c4acc64", size = 49234 }, +] + +[[package]] +name = "tests" +version = "0.1.0" +source = { virtual = "." } +dependencies = [ + { name = "numpy" }, + { name = "pytest" }, + { name = "syrupy" }, +] + +[package.metadata] +requires-dist = [ + { name = "numpy" }, + { name = "pytest" }, + { name = "syrupy" }, +]