From 6658624172af9b53abeaa5311b794d67e8b241fb Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Wed, 20 Nov 2024 00:02:04 -0800 Subject: [PATCH] PARTITION KEY support (#122) * initial pass at PARTITION KEY support. * unit tests * gha this PR branch * fixup tests * doc internal * fix tests, KNN/rowids in * define SQLITE_INDEX_CONSTRAINT_OFFSET * whoops * update tests, syrupy, use uv * un ignore pyproject.toml * dot * tests/ * type error? * win: .exe, update error name * try fix macos python, paren around expr? * win bash? * dbg :( * explicit error * op * dbg win * win ./tests/.venv/Scripts/python.exe * block UPDATEs on partition key values for now --- .github/workflows/test.yaml | 135 +-- .gitignore | 1 - ARCHITECTURE.md | 54 ++ Makefile | 7 +- TODO | 5 + sqlite-vec.c | 915 ++++++++++++++---- test.sql | 49 + tests/.python-version | 1 + tests/__snapshots__/test-partition-keys.ambr | 245 +++++ tests/conftest.py | 12 + tests/pyproject.toml | 9 + ...orrectness.py => skip.test-correctness.py} | 0 tests/test-loadable.py | 45 +- tests/test-partition-keys.py | 115 +++ tests/test-unit.c | 54 ++ tests/uv.lock | 120 +++ 16 files changed, 1522 insertions(+), 245 deletions(-) create mode 100644 ARCHITECTURE.md create mode 100644 TODO create mode 100644 test.sql create mode 100644 tests/.python-version create mode 100644 tests/__snapshots__/test-partition-keys.ambr create mode 100644 tests/conftest.py create mode 100644 tests/pyproject.toml rename tests/{test-correctness.py => skip.test-correctness.py} (100%) create mode 100644 tests/test-partition-keys.py create mode 100644 tests/test-unit.c create mode 100644 tests/uv.lock diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index abb8490..96da148 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -3,6 +3,7 @@ on: push: branches: - main + - partition-by permissions: contents: read jobs: @@ -10,16 +11,92 @@ jobs: runs-on: ubuntu-20.04 steps: - uses: actions/checkout@v4 - - uses: actions/setup-python@v5 + - uses: astral-sh/setup-uv@v3 with: - python-version: "3.12" + enable-cache: true - run: ./scripts/vendor.sh - run: make loadable static - - run: pip install pytest numpy; make test-loadable + - run: uv sync --directory tests + - run: make test-loadable python=./tests/.venv/bin/python - uses: actions/upload-artifact@v4 with: name: sqlite-vec-linux-x86_64-extension path: dist/* + build-macos-x86_64-extension: + runs-on: macos-12 + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + - run: uv python install 3.12 + - run: ./scripts/vendor.sh + - run: make loadable static + - run: uv sync --directory tests + - run: make test-loadable python=./tests/.venv/bin/python + - uses: actions/upload-artifact@v4 + with: + name: sqlite-vec-macos-x86_64-extension + path: dist/* + build-macos-aarch64-extension: + runs-on: macos-14 + steps: + - uses: actions/checkout@v4 + - uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + - run: ./scripts/vendor.sh + - run: make loadable static + - run: uv sync --directory tests + - run: make test-loadable python=./tests/.venv/bin/python + - uses: actions/upload-artifact@v4 + with: + name: sqlite-vec-macos-aarch64-extension + path: dist/* + build-windows-x86_64-extension: + runs-on: windows-2019 + steps: + - uses: actions/checkout@v4 + - uses: ilammy/msvc-dev-cmd@v1 + - uses: astral-sh/setup-uv@v3 + with: + enable-cache: true + - run: ./scripts/vendor.sh + shell: bash + - run: make sqlite-vec.h + - run: mkdir dist + - run: cl.exe /fPIC 
-shared /W4 /Ivendor/ /O2 /LD sqlite-vec.c -o dist/vec0.dll + - run: uv sync --directory tests + - run: make test-loadable python=./tests/.venv/Scripts/python.exe + shell: bash + - uses: actions/upload-artifact@v4 + with: + name: sqlite-vec-windows-x86_64-extension + path: dist/* + build-linux-aarch64-extension: + runs-on: ubuntu-latest + steps: + - uses: green-coding-solutions/eco-ci-energy-estimation@v4 + with: + task: start-measurement + - uses: actions/checkout@v4 + with: + version: "latest" + - run: sudo apt-get install gcc-arm-linux-gnueabihf + - run: ./scripts/vendor.sh + - run: make sqlite-vec.h + - run: make CC=arm-linux-gnueabihf-gcc loadable static + - uses: actions/upload-artifact@v4 + with: + name: sqlite-vec-linux-aarch64-extension + path: dist/* + - uses: green-coding-solutions/eco-ci-energy-estimation@v4 + with: + task: get-measurement + label: "all" + - uses: green-coding-solutions/eco-ci-energy-estimation@v4 + with: + task: display-results build-android-extensions: runs-on: ubuntu-latest strategy: @@ -98,58 +175,6 @@ jobs: with: name: sqlite-vec-${{ matrix.platforms.name }}-extension path: dist/* - build-macos-x86_64-extension: - runs-on: macos-12 - steps: - - uses: actions/checkout@v4 - - run: ./scripts/vendor.sh - - run: make loadable static - - run: /usr/local/opt/python@3/libexec/bin/python -m pip install --break-system-packages pytest numpy; make test-loadable python=/usr/local/opt/python@3/libexec/bin/python - - uses: actions/upload-artifact@v4 - with: - name: sqlite-vec-macos-x86_64-extension - path: dist/* - build-macos-aarch64-extension: - runs-on: macos-14 - steps: - - uses: actions/checkout@v4 - - run: ./scripts/vendor.sh - - run: make loadable static - - run: /opt/homebrew/opt/python3/libexec/bin/python -m pip install pytest numpy --break-system-packages; make test-loadable python=/opt/homebrew/opt/python3/libexec/bin/python - - uses: actions/upload-artifact@v4 - with: - name: sqlite-vec-macos-aarch64-extension - path: dist/* - build-windows-x86_64-extension: - runs-on: windows-2019 - steps: - - uses: actions/checkout@v4 - - uses: ilammy/msvc-dev-cmd@v1 - - uses: actions/setup-python@v5 - with: - python-version: "3.12" - - run: ./scripts/vendor.sh - shell: bash - - run: make sqlite-vec.h - - run: mkdir dist - - run: cl.exe /fPIC -shared /W4 /Ivendor/ /O2 /LD sqlite-vec.c -o dist/vec0.dll - - run: pip install pytest numpy; make test-loadable - - uses: actions/upload-artifact@v4 - with: - name: sqlite-vec-windows-x86_64-extension - path: dist/* - build-linux-aarch64-extension: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - run: sudo apt-get install gcc-arm-linux-gnueabihf - - run: ./scripts/vendor.sh - - run: make sqlite-vec.h - - run: make CC=arm-linux-gnueabihf-gcc loadable static - - uses: actions/upload-artifact@v4 - with: - name: sqlite-vec-linux-aarch64-extension - path: dist/* build-wasm32-emscripten: runs-on: ubuntu-latest steps: diff --git a/.gitignore b/.gitignore index 38f9876..ef7a661 100644 --- a/.gitignore +++ b/.gitignore @@ -26,4 +26,3 @@ sqlite-vec.h tmp/ poetry.lock -pyproject.toml \ No newline at end of file diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md new file mode 100644 index 0000000..8ac9501 --- /dev/null +++ b/ARCHITECTURE.md @@ -0,0 +1,54 @@ +## `vec0` + +### idxStr + +The `vec0` idxStr is a string composed of single "header" character and 0 or +more "blocks" of 4 characters each. + +The "header" charcter denotes the type of query plan, as determined by the +`enum vec0_query_plan` values. 
The current possible values are:
+
+| Name                       | Value | Description                                                             |
+| -------------------------- | ----- | ----------------------------------------------------------------------- |
+| `VEC0_QUERY_PLAN_FULLSCAN` | `'1'` | Perform a full-scan on all rows                                          |
+| `VEC0_QUERY_PLAN_POINT`    | `'2'` | Perform a single-lookup point query for the provided rowid               |
+| `VEC0_QUERY_PLAN_KNN`      | `'3'` | Perform a KNN-style query on the provided query vector and parameters.   |
+
+Each 4-character "block" is associated with a corresponding value in `argv[]`. For example, the 1st block, at byte offsets `1-4` (inclusive), is associated with `argv[1]`; the 2nd block, at byte offsets `5-8` (inclusive), is associated with `argv[2]`, and so on. Each block describes what kind of value or filter the given `argv[i]` value is.
+
+For example, the idxStr `3{___}___]Aa_` describes a KNN query where `argv[1]` is the query vector, `argv[2]` is the k/limit value, and `argv[3]` is an equality constraint on the first partition key column.
+
+#### `VEC0_IDXSTR_KIND_KNN_MATCH` (`'{'`)
+
+`argv[i]` is the query vector of the KNN query.
+
+The remaining 3 characters of the block are `_` fillers.
+
+#### `VEC0_IDXSTR_KIND_KNN_K` (`'}'`)
+
+`argv[i]` is the limit/k value of the KNN query.
+
+The remaining 3 characters of the block are `_` fillers.
+
+#### `VEC0_IDXSTR_KIND_KNN_ROWID_IN` (`'['`)
+
+`argv[i]` is the optional `rowid in (...)` value, and must be handled with [`sqlite3_vtab_in_first()` /
+`sqlite3_vtab_in_next()`](https://www.sqlite.org/c3ref/vtab_in_first.html).
+
+The remaining 3 characters of the block are `_` fillers.
+
+#### `VEC0_IDXSTR_KIND_KNN_PARTITON_CONSTRAINT` (`']'`)
+
+`argv[i]` is a "constraint" on a specific partition key.
+
+The second character of the block denotes which partition key to filter on, using `A` to denote the first partition key column, `B` for the second, etc. It is encoded with `'A' + partition_idx` and can be decoded with `c - 'A'`.
+
+The third character of the block denotes which operator is used in the constraint. It will be one of the values of `enum vec0_partition_operator`, as only a subset of operations are supported on partition keys.
+
+The fourth character of the block is a `_` filler.
+
+#### `VEC0_IDXSTR_KIND_POINT_ID` (`'!'`)
+
+`argv[i]` is the value of the rowid or id to match against for the point query.
+
+The remaining 3 characters of the block are `_` fillers.
diff --git a/Makefile b/Makefile
index 8eb6170..1496b7a 100644
--- a/Makefile
+++ b/Makefile
@@ -185,13 +185,16 @@ publish-release:
 # -k test_vec0_update
 test-loadable: loadable
-	$(PYTHON) -m pytest -vv -s -x tests/test-loadable.py
+	$(PYTHON) -m pytest -vv -s -x tests/test-*.py
 
 test-loadable-snapshot-update: loadable
 	$(PYTHON) -m pytest -vv tests/test-loadable.py --snapshot-update
 
 test-loadable-watch:
-	watchexec -w sqlite-vec.c -w tests/test-loadable.py -w Makefile --clear -- make test-loadable
+	watchexec --exts c,py,Makefile --clear -- make test-loadable
+
+test-unit:
+	$(CC) tests/test-unit.c sqlite-vec.c -I./ -Ivendor -o $(prefix)/test-unit && $(prefix)/test-unit
 
 site-dev:
 	npm --prefix site run dev
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..a487ddd
--- /dev/null
+++ b/TODO
@@ -0,0 +1,5 @@
+# partition
+
+- [ ] UPDATE on partition key values
+  - remove previous row from chunk, insert into new one?
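+  - for now, vec0Update_Update() raises an error when an UPDATE changes a partition key value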
+- [ ] properly sqlite3_vtab_nochange / sqlite3_value_nochange handling diff --git a/sqlite-vec.c b/sqlite-vec.c index cb4901d..caa992e 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -105,6 +105,10 @@ typedef size_t usize; #define SQLITE_INDEX_CONSTRAINT_LIMIT 73 #endif +#ifndef SQLITE_INDEX_CONSTRAINT_OFFSET +#define SQLITE_INDEX_CONSTRAINT_OFFSET 74 +#endif + #define countof(x) (sizeof(x) / sizeof((x)[0])) #define min(a, b) (((a) <= (b)) ? (a) : (b)) @@ -1930,6 +1934,83 @@ int vec0_parse_table_option(const char *source, int source_length, } return SQLITE_ERROR; } +/** + * @brief Parse an argv[i] entry of a vec0 virtual table definition, and see if + * it's a PARTITION KEY definition. + * + * @param source: argv[i] source string + * @param source_length: length of the source string + * @param out_column_name: If it is a partition key, the output column name. Same lifetime + * as source, points to specific char * + * @param out_column_name_length: Length of out_column_name in bytes + * @param out_column_type: SQLITE_TEXT or SQLITE_INTEGER. + * @return int: SQLITE_EMPTY if not a PK, SQLITE_OK if it is. + */ +int vec0_parse_partition_key_definition(const char *source, int source_length, + char **out_column_name, + int *out_column_name_length, + int *out_column_type) { + struct Vec0Scanner scanner; + struct Vec0Token token; + char *column_name; + int column_name_length; + int column_type; + vec0_scanner_init(&scanner, source, source_length); + + // Check first token is identifier, will be the column name + int rc = vec0_scanner_next(&scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME && + token.token_type != TOKEN_TYPE_IDENTIFIER) { + return SQLITE_EMPTY; + } + + column_name = token.start; + column_name_length = token.end - token.start; + + // Check the next token matches "text" or "integer", as column type + rc = vec0_scanner_next(&scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME && + token.token_type != TOKEN_TYPE_IDENTIFIER) { + return SQLITE_EMPTY; + } + if (sqlite3_strnicmp(token.start, "text", token.end - token.start) == 0) { + column_type = SQLITE_TEXT; + } else if (sqlite3_strnicmp(token.start, "int", token.end - token.start) == + 0 || + sqlite3_strnicmp(token.start, "integer", + token.end - token.start) == 0) { + column_type = SQLITE_INTEGER; + } else { + return SQLITE_EMPTY; + } + + // Check the next token is identifier and matches "partition" + rc = vec0_scanner_next(&scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME && + token.token_type != TOKEN_TYPE_IDENTIFIER) { + return SQLITE_EMPTY; + } + if (sqlite3_strnicmp(token.start, "partition", token.end - token.start) != 0) { + return SQLITE_EMPTY; + } + + // Check the next token is identifier and matches "key" + rc = vec0_scanner_next(&scanner, &token); + if (rc != VEC0_TOKEN_RESULT_SOME && + token.token_type != TOKEN_TYPE_IDENTIFIER) { + return SQLITE_EMPTY; + } + if (sqlite3_strnicmp(token.start, "key", token.end - token.start) != 0) { + return SQLITE_EMPTY; + } + + *out_column_name = column_name; + *out_column_name_length = column_name_length; + *out_column_type = column_type; + + return SQLITE_OK; +} + /** * @brief Parse an argv[i] entry of a vec0 virtual table definition, and see if * it's a PRIMARY KEY definition. @@ -1942,7 +2023,7 @@ int vec0_parse_table_option(const char *source, int source_length, * @param out_column_type: SQLITE_TEXT or SQLITE_INTEGER. * @return int: SQLITE_EMPTY if not a PK, SQLITE_OK if it is. 
*/ -int parse_primary_key_definition(const char *source, int source_length, +int vec0_parse_primary_key_definition(const char *source, int source_length, char **out_column_name, int *out_column_name_length, int *out_column_type) { @@ -2021,6 +2102,12 @@ struct VectorColumnDefinition { enum Vec0DistanceMetrics distance_metric; }; +struct Vec0PartitionColumnDefinition { + int type; + char * name; + int name_length; +}; + size_t vector_byte_size(enum VectorElementType element_type, size_t dimensions) { switch (element_type) { @@ -2048,7 +2135,7 @@ size_t vector_column_byte_size(struct VectorColumnDefinition column) { * @return int SQLITE_OK on success, SQLITE_EMPTY is it's not a vector column * definition, SQLITE_ERROR on error. */ -int parse_vector_column(const char *source, int source_length, +int vec0_parse_vector_column(const char *source, int source_length, struct VectorColumnDefinition *outColumn) { // parses a vector column definition like so: // "abc float[123]", "abc_123 bit[1234]", eetc. @@ -3128,7 +3215,7 @@ static sqlite3_module vec_npy_eachModule = { #pragma region vec0 virtual table #define VEC0_COLUMN_ID 0 -#define VEC0_COLUMN_VECTORN_START 1 +#define VEC0_COLUMN_USERN_START 1 #define VEC0_COLUMN_OFFSET_DISTANCE 1 #define VEC0_COLUMN_OFFSET_K 2 @@ -3178,9 +3265,20 @@ static sqlite3_module vec_npy_eachModule = { typedef struct vec0_vtab vec0_vtab; -#define VEC0_MAX_VECTOR_COLUMNS 16 +#define VEC0_MAX_VECTOR_COLUMNS 16 +#define VEC0_MAX_PARTITION_COLUMNS 4 #define SQLITE_VEC_VEC0_MAX_DIMENSIONS 8192 +typedef enum { + // vector column, ie "contents_embedding float[1024]" + SQLITE_VEC0_USER_COLUMN_KIND_VECTOR = 1, + + // partition key column, ie "user_id integer partition key" + SQLITE_VEC0_USER_COLUMN_KIND_PARTITION = 2, + + // TODO: metadata + metadata filters +} vec0_user_column_kind; + struct vec0_vtab { sqlite3_vtab base; @@ -3191,6 +3289,13 @@ struct vec0_vtab { // Will change the schema of the _rowids table, and insert/query logic. int pkIsText; + // number of defined vector columns. + int numVectorColumns; + + // number of defined PARTITION KEY columns. + int numPartitionColumns; + + // Name of the schema the table exists on. // Must be freed with sqlite3_free() char *schemaName; @@ -3207,6 +3312,13 @@ struct vec0_vtab { // Must be freed with sqlite3_free() char *shadowChunksName; + // contains enum vec0_user_column_kind values for up to + // numVectorColumns + numPartitionColumns entries + uint8_t user_column_kinds[VEC0_MAX_VECTOR_COLUMNS + VEC0_MAX_PARTITION_COLUMNS]; + + uint8_t user_column_idxs[VEC0_MAX_VECTOR_COLUMNS + VEC0_MAX_PARTITION_COLUMNS]; + + // Name of all the vector chunk shadow tables. // Ex '_vector_chunks00' // Only the first numVectorColumns entries will be available. @@ -3214,9 +3326,7 @@ struct vec0_vtab { char *shadowVectorChunksNames[VEC0_MAX_VECTOR_COLUMNS]; struct VectorColumnDefinition vector_columns[VEC0_MAX_VECTOR_COLUMNS]; - - // number of defined vector columns. - int numVectorColumns; + struct Vec0PartitionColumnDefinition paritition_columns[VEC0_MAX_PARTITION_COLUMNS]; int chunk_size; @@ -3321,6 +3431,10 @@ void vec0_free(vec0_vtab *p) { } } +int vec0_num_defined_user_columns(vec0_vtab *p) { + return p->numVectorColumns + p->numPartitionColumns; +} + /** * @brief Returns the index of the distance hidden column for the given vec0 * table. 
@@ -3329,7 +3443,7 @@ void vec0_free(vec0_vtab *p) { * @return int */ int vec0_column_distance_idx(vec0_vtab *p) { - return VEC0_COLUMN_VECTORN_START + (p->numVectorColumns - 1) + + return VEC0_COLUMN_USERN_START + (vec0_num_defined_user_columns(p) - 1) + VEC0_COLUMN_OFFSET_DISTANCE; } @@ -3340,7 +3454,7 @@ int vec0_column_distance_idx(vec0_vtab *p) { * @return int k column index */ int vec0_column_k_idx(vec0_vtab *p) { - return VEC0_COLUMN_VECTORN_START + (p->numVectorColumns - 1) + + return VEC0_COLUMN_USERN_START + (vec0_num_defined_user_columns(p) - 1) + VEC0_COLUMN_OFFSET_K; } @@ -3349,18 +3463,36 @@ int vec0_column_k_idx(vec0_vtab *p) { * 0 otherwise. */ int vec0_column_idx_is_vector(vec0_vtab *pVtab, int column_idx) { - return column_idx >= VEC0_COLUMN_VECTORN_START && - column_idx <= - (VEC0_COLUMN_VECTORN_START + pVtab->numVectorColumns - 1); + return column_idx >= VEC0_COLUMN_USERN_START && + column_idx <= (VEC0_COLUMN_USERN_START + vec0_num_defined_user_columns(pVtab) - 1) && + pVtab->user_column_kinds[column_idx - VEC0_COLUMN_USERN_START] == SQLITE_VEC0_USER_COLUMN_KIND_VECTOR; } /** - * Returns the vector index of the given vector column index. + * Returns the vector index of the given user column index. * ONLY call if validated with vec0_column_idx_is_vector before */ int vec0_column_idx_to_vector_idx(vec0_vtab *pVtab, int column_idx) { UNUSED_PARAMETER(pVtab); - return column_idx - VEC0_COLUMN_VECTORN_START; + return pVtab->user_column_idxs[column_idx - VEC0_COLUMN_USERN_START]; +} +/** + * Returns 1 if the given column-based index is a "partition key" column, + * 0 otherwise. + */ +int vec0_column_idx_is_partition(vec0_vtab *pVtab, int column_idx) { + return column_idx >= VEC0_COLUMN_USERN_START && + column_idx <= (VEC0_COLUMN_USERN_START + vec0_num_defined_user_columns(pVtab) - 1) && + pVtab->user_column_kinds[column_idx - VEC0_COLUMN_USERN_START] == SQLITE_VEC0_USER_COLUMN_KIND_PARTITION; +} + +/** + * Returns the partition column index of the given user column index. + * ONLY call if validated with vec0_column_idx_is_vector before + */ +int vec0_column_idx_to_partition_idx(vec0_vtab *pVtab, int column_idx) { + UNUSED_PARAMETER(pVtab); + return pVtab->user_column_idxs[column_idx - VEC0_COLUMN_USERN_START]; } /** @@ -3593,13 +3725,74 @@ int vec0_get_vector_data(vec0_vtab *pVtab, i64 rowid, int vector_column_idx, return rc; } -int vec0_get_latest_chunk_rowid(vec0_vtab *p, i64 *chunk_rowid) { +/** + * @brief Retrieve the sqlite3_value of the i'th partition value for the given row. 
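+ *        (The row's chunk is located via vec0_get_chunk_position(); the value is
+ *        then read from that chunk's partitionXX column in the _chunks shadow table.)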
+ * + * @param pVtab - the vec0_vtab in questions + * @param rowid - rowid of target row + * @param partition_idx - which partition column to retrieve + * @param outValue - output sqlite3_value + * @return int - SQLITE_OK on success, otherwise error code + */ +int vec0_get_partition_value_for_rowid(vec0_vtab *pVtab, i64 rowid, int partition_idx, sqlite3_value ** outValue) { + int rc; + i64 chunk_id; + i64 chunk_offset; + rc = vec0_get_chunk_position(pVtab, rowid, NULL, &chunk_id, &chunk_offset); + if(rc != SQLITE_OK) { + return rc; + } + sqlite3_stmt * stmt = NULL; + char * zSql = sqlite3_mprintf("SELECT partition%02d FROM " VEC0_SHADOW_CHUNKS_NAME " WHERE chunk_id = ?", partition_idx, pVtab->schemaName, pVtab->tableName); + if(!zSql) { + return SQLITE_NOMEM; + } + rc = sqlite3_prepare_v2(pVtab->db, zSql, -1, &stmt, NULL); + sqlite3_free(zSql); + if(rc != SQLITE_OK) { + return rc; + } + sqlite3_bind_int64(stmt, 1, chunk_id); + rc = sqlite3_step(stmt); + if(rc != SQLITE_ROW) { + rc = SQLITE_ERROR; + goto done; + } + *outValue = sqlite3_value_dup(sqlite3_column_value(stmt, 0)); + if(!*outValue) { + rc = SQLITE_NOMEM; + goto done; + } + rc = SQLITE_OK; + + done: + sqlite3_finalize(stmt); + return rc; + +} + +int vec0_get_latest_chunk_rowid(vec0_vtab *p, i64 *chunk_rowid, sqlite3_value ** partitionKeyValues) { int rc; const char *zSql; // lazy initialize stmtLatestChunk when needed. May be cleared during xSync() if (!p->stmtLatestChunk) { - zSql = sqlite3_mprintf("SELECT max(rowid) FROM " VEC0_SHADOW_CHUNKS_NAME, + if(p->numPartitionColumns > 0) { + sqlite3_str * s = sqlite3_str_new(NULL); + sqlite3_str_appendf(s, "SELECT max(rowid) FROM " VEC0_SHADOW_CHUNKS_NAME " WHERE ", + p->schemaName, p->tableName); + + for(int i = 0; i < p->numPartitionColumns; i++) { + if(i != 0) { + sqlite3_str_appendall(s, " AND "); + } + sqlite3_str_appendf(s, " partition%02d = ? ", i); + } + zSql = sqlite3_str_finish(s); + }else { + zSql = sqlite3_mprintf("SELECT max(rowid) FROM " VEC0_SHADOW_CHUNKS_NAME, p->schemaName, p->tableName); + } + if (!zSql) { rc = SQLITE_NOMEM; goto cleanup; @@ -3614,6 +3807,10 @@ int vec0_get_latest_chunk_rowid(vec0_vtab *p, i64 *chunk_rowid) { } } + for(int i = 0; i < p->numPartitionColumns; i++) { + sqlite3_bind_value(p->stmtLatestChunk, i+1, (partitionKeyValues[i])); + } + rc = sqlite3_step(p->stmtLatestChunk); if (rc != SQLITE_ROW) { // IMP: V31559_15629 @@ -3621,6 +3818,10 @@ int vec0_get_latest_chunk_rowid(vec0_vtab *p, i64 *chunk_rowid) { rc = SQLITE_ERROR; goto cleanup; } + if(sqlite3_column_type(p->stmtLatestChunk, 0) == SQLITE_NULL){ + rc = SQLITE_EMPTY; + goto cleanup; + } *chunk_rowid = sqlite3_column_int64(p->stmtLatestChunk, 0); rc = sqlite3_step(p->stmtLatestChunk); if (rc != SQLITE_DONE) { @@ -3636,6 +3837,7 @@ int vec0_get_latest_chunk_rowid(vec0_vtab *p, i64 *chunk_rowid) { cleanup: if (p->stmtLatestChunk) { sqlite3_reset(p->stmtLatestChunk); + sqlite3_clear_bindings(p->stmtLatestChunk); } return rc; } @@ -3825,21 +4027,39 @@ int vec0_rowids_update_position(vec0_vtab *p, i64 rowid, i64 chunk_rowid, * rowid to insert new blank rows into _vector_chunksXX tables. * * @param p: vec0 table to add new chunk - * @param chunk_rowid: Putput pointer, if not NULL, then will be filled with the + * @param paritionKeyValues: Array of partition key valeus for the new chunk, if available + * @param chunk_rowid: Output pointer, if not NULL, then will be filled with the * new chunk rowid. * @return int SQLITE_OK on success, error code otherwise. 
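+ * When partition key columns are defined, the provided partitionKeyValues are
+ * also written to the new row's partitionXX columns in the _chunks shadow table.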
*/ -int vec0_new_chunk(vec0_vtab *p, i64 *chunk_rowid) { +int vec0_new_chunk(vec0_vtab *p, sqlite3_value ** partitionKeyValues, i64 *chunk_rowid) { int rc; char *zSql; sqlite3_stmt *stmt; i64 rowid; // Step 1: Insert a new row in _chunks, capture that new rowid - zSql = sqlite3_mprintf("INSERT INTO " VEC0_SHADOW_CHUNKS_NAME + if(p->numPartitionColumns > 0) { + sqlite3_str * s = sqlite3_str_new(NULL); + sqlite3_str_appendf(s, "INSERT INTO " VEC0_SHADOW_CHUNKS_NAME, p->schemaName, p->tableName); + sqlite3_str_appendall(s, "(size, validity, rowids"); + for(int i = 0; i < p->numPartitionColumns; i++) { + sqlite3_str_appendf(s, ", partition%02d", i); + } + sqlite3_str_appendall(s, ") VALUES (?, ?, ?"); + for(int i = 0; i < p->numPartitionColumns; i++) { + sqlite3_str_appendall(s, ", ?"); + } + sqlite3_str_appendall(s, ")"); + + zSql = sqlite3_str_finish(s); + }else { + zSql = sqlite3_mprintf("INSERT INTO " VEC0_SHADOW_CHUNKS_NAME "(size, validity, rowids) " "VALUES (?, ?, ?);", p->schemaName, p->tableName); + } + if (!zSql) { return SQLITE_NOMEM; } @@ -3860,6 +4080,10 @@ int vec0_new_chunk(vec0_vtab *p, i64 *chunk_rowid) { sqlite3_bind_zeroblob(stmt, 2, p->chunk_size / CHAR_BIT); // validity bitmap sqlite3_bind_zeroblob(stmt, 3, p->chunk_size * sizeof(i64)); // rowids + for(int i = 0; i < p->numPartitionColumns; i++) { + sqlite3_bind_value(stmt, 4 + i, partitionKeyValues[i]); + } + rc = sqlite3_step(stmt); int failed = rc != SQLITE_DONE; rowid = sqlite3_last_insert_rowid(p->db); @@ -3876,14 +4100,18 @@ int vec0_new_chunk(vec0_vtab *p, i64 *chunk_rowid) { // Step 2: Create new vector chunks for each vector column, with // that new chunk_rowid. - for (int i = 0; i < p->numVectorColumns; i++) { + for (int i = 0; i < vec0_num_defined_user_columns(p); i++) { + if(p->user_column_kinds[i] != SQLITE_VEC0_USER_COLUMN_KIND_VECTOR) { + continue; + } + int vector_column_idx = p->user_column_idxs[i]; i64 vectorsSize = - p->chunk_size * vector_column_byte_size(p->vector_columns[i]); + p->chunk_size * vector_column_byte_size(p->vector_columns[vector_column_idx]); zSql = sqlite3_mprintf("INSERT INTO " VEC0_SHADOW_VECTOR_N_NAME "(rowid, vectors)" "VALUES (?, ?)", - p->schemaName, p->tableName, i); + p->schemaName, p->tableName, vector_column_idx); if (!zSql) { return SQLITE_NOMEM; } @@ -3912,21 +4140,6 @@ int vec0_new_chunk(vec0_vtab *p, i64 *chunk_rowid) { return SQLITE_OK; } -// Possible query plans for xBestIndex on vec0 tables. -typedef enum { - // Full scan, every row is queried. - SQLITE_VEC0_QUERYPLAN_FULLSCAN, - - // A single row is queried by rowid/id - SQLITE_VEC0_QUERYPLAN_POINT, - - // A KNN-style query is made on a specific vector column. - // Requires - // 1) a MATCH/compatible distance contraint on a single vector column - // 2) either a 'LIMIT ?' or 'k=?' contraint - SQLITE_VEC0_QUERYPLAN_KNN, -} vec0_query_plan; - struct vec0_query_fullscan_data { sqlite3_stmt *rowids_stmt; i8 done; @@ -3979,6 +4192,14 @@ void vec0_query_point_data_clear(struct vec0_query_point_data *point_data) { } } +typedef enum { + // If any values are updated, please update the ARCHITECTURE.md docs accordingly! 
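+  // The chosen character is stored as idxStr[0] and selects which vec0Filter_*
+  // routine (fullscan, point, or KNN) handles the query.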
+ + VEC0_QUERY_PLAN_FULLSCAN = '1', + VEC0_QUERY_PLAN_POINT = '2', + VEC0_QUERY_PLAN_KNN = '3', +} vec0_query_plan; + typedef struct vec0_cursor vec0_cursor; struct vec0_cursor { sqlite3_vtab_cursor base; @@ -4025,6 +4246,8 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, // option int chunk_size = -1; int numVectorColumns = 0; + int numPartitionColumns = 0; + int user_column_idx = 0; // track if a "primary key" column is defined char *pkColumnName = NULL; @@ -4032,8 +4255,14 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, int pkColumnType = SQLITE_INTEGER; for (int i = 3; i < argc; i++) { - struct VectorColumnDefinition c; - rc = parse_vector_column(argv[i], strlen(argv[i]), &c); + struct VectorColumnDefinition vecColumn; + struct Vec0PartitionColumnDefinition partitionColumn; + char *cName = NULL; + int cNameLength; + int cType; + + // Scenario #1: Constructor argument is a vector column definition, ie `foo float[1024]` + rc = vec0_parse_vector_column(argv[i], strlen(argv[i]), &vecColumn); if (rc == SQLITE_ERROR) { *pzErr = sqlite3_mprintf( VEC_CONSTRUCTOR_ERROR "could not parse vector column '%s'", argv[i]); @@ -4041,30 +4270,59 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } if (rc == SQLITE_OK) { if (numVectorColumns >= VEC0_MAX_VECTOR_COLUMNS) { - sqlite3_free(c.name); + sqlite3_free(vecColumn.name); *pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR "Too many provided vector columns, maximum %d", VEC0_MAX_VECTOR_COLUMNS); goto error; } - if (c.dimensions > SQLITE_VEC_VEC0_MAX_DIMENSIONS) { - sqlite3_free(c.name); + if (vecColumn.dimensions > SQLITE_VEC_VEC0_MAX_DIMENSIONS) { + sqlite3_free(vecColumn.name); *pzErr = sqlite3_mprintf( VEC_CONSTRUCTOR_ERROR "Dimension on vector column too large, provided %lld, maximum %lld", - (i64)c.dimensions, SQLITE_VEC_VEC0_MAX_DIMENSIONS); + (i64)vecColumn.dimensions, SQLITE_VEC_VEC0_MAX_DIMENSIONS); goto error; } - memcpy(&pNew->vector_columns[numVectorColumns], &c, sizeof(c)); + pNew->user_column_kinds[user_column_idx] = SQLITE_VEC0_USER_COLUMN_KIND_VECTOR; + pNew->user_column_idxs[user_column_idx] = numVectorColumns; + memcpy(&pNew->vector_columns[numVectorColumns], &vecColumn, sizeof(vecColumn)); numVectorColumns++; + user_column_idx++; + continue; } - char *cName = NULL; - int cNameLength; - int cType; - rc = parse_primary_key_definition(argv[i], strlen(argv[i]), &cName, + // Scenario #2: Constructor argument is a partition key column definition, ie `user_id text partition key` + rc = vec0_parse_partition_key_definition(argv[i], strlen(argv[i]), &cName, + &cNameLength, &cType); + if (rc == SQLITE_OK) { + if (numPartitionColumns >= VEC0_MAX_PARTITION_COLUMNS) { + *pzErr = sqlite3_mprintf( + VEC_CONSTRUCTOR_ERROR + "More than %d partition key columns were provided", + VEC0_MAX_PARTITION_COLUMNS); + goto error; + } + partitionColumn.type = cType; + partitionColumn.name_length = cNameLength; + partitionColumn.name = sqlite3_mprintf("%.*s", cNameLength, cName); + if(!partitionColumn.name) { + rc = SQLITE_NOMEM; + goto error; + } + + pNew->user_column_kinds[user_column_idx] = SQLITE_VEC0_USER_COLUMN_KIND_PARTITION; + pNew->user_column_idxs[user_column_idx] = numPartitionColumns; + memcpy(&pNew->paritition_columns[numPartitionColumns], &partitionColumn, sizeof(partitionColumn)); + numPartitionColumns++; + user_column_idx++; + continue; + } + + // Scenario #3: Constructor argument is a primary key column definition, ie `article_id text primary key` 
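+    // (At most one primary key column may be declared; a duplicate is caught below.)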
+ rc = vec0_parse_primary_key_definition(argv[i], strlen(argv[i]), &cName, &cNameLength, &cType); if (rc == SQLITE_OK) { if (pkColumnName) { @@ -4081,6 +4339,8 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, continue; } + // Scenario #4: Constructor argument is a table-level option, ie `chunk_size` + char *key; char *value; int keyLength, valueLength; @@ -4121,6 +4381,8 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } continue; } + + // Scenario #5: Unknown constructor argument *pzErr = sqlite3_mprintf(VEC_CONSTRUCTOR_ERROR "Could not parse '%s'", argv[i]); goto error; @@ -4144,10 +4406,24 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } else { sqlite3_str_appendall(createStr, "rowid, "); } - for (int i = 0; i < numVectorColumns; i++) { - sqlite3_str_appendf(createStr, "\"%.*w\", ", - pNew->vector_columns[i].name_length, - pNew->vector_columns[i].name); + for (int i = 0; i < numVectorColumns + numPartitionColumns; i++) { + switch(pNew->user_column_kinds[i]) { + case SQLITE_VEC0_USER_COLUMN_KIND_VECTOR: { + int vector_idx = pNew->user_column_idxs[i]; + sqlite3_str_appendf(createStr, "\"%.*w\", ", + pNew->vector_columns[vector_idx].name_length, + pNew->vector_columns[vector_idx].name); + break; + } + case SQLITE_VEC0_USER_COLUMN_KIND_PARTITION: { + int partition_idx = pNew->user_column_idxs[i]; + sqlite3_str_appendf(createStr, "\"%.*w\", ", + pNew->paritition_columns[partition_idx].name_length, + pNew->paritition_columns[partition_idx].name); + break; + } + } + } sqlite3_str_appendall(createStr, " distance hidden, k hidden) "); if (pkColumnName) { @@ -4188,9 +4464,7 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, goto error; } pNew->numVectorColumns = numVectorColumns; - if (!pNew->numVectorColumns) { - goto error; - } + pNew->numPartitionColumns = numPartitionColumns; for (int i = 0; i < pNew->numVectorColumns; i++) { pNew->shadowVectorChunksNames[i] = sqlite3_mprintf("%s_vector_chunks%02d", tableName, i); @@ -4206,12 +4480,24 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, int rc; // create the _chunks shadow table - char *zCreateShadowChunks; - zCreateShadowChunks = sqlite3_mprintf(VEC0_SHADOW_CHUNKS_CREATE, + char *zCreateShadowChunks = NULL; + if(pNew->numPartitionColumns) { + sqlite3_str * s = sqlite3_str_new(NULL); + sqlite3_str_appendf(s, "CREATE TABLE " VEC0_SHADOW_CHUNKS_NAME "(", pNew->schemaName, pNew->tableName); + sqlite3_str_appendall(s, "chunk_id INTEGER PRIMARY KEY AUTOINCREMENT," "size INTEGER NOT NULL,"); + sqlite3_str_appendall(s, "sequence_id integer,"); + for(int i = 0; i < pNew->numPartitionColumns;i++) { + sqlite3_str_appendf(s, "partition%02d,", i); + } + sqlite3_str_appendall(s, "validity BLOB NOT NULL, rowids BLOB NOT NULL);"); + zCreateShadowChunks = sqlite3_str_finish(s); + }else { + zCreateShadowChunks = sqlite3_mprintf(VEC0_SHADOW_CHUNKS_CREATE, pNew->schemaName, pNew->tableName); - if (!zCreateShadowChunks) { - goto error; } + if (!zCreateShadowChunks) { + goto error; + } rc = sqlite3_prepare_v2(db, zCreateShadowChunks, -1, &stmt, 0); sqlite3_free((void *)zCreateShadowChunks); if ((rc != SQLITE_OK) || (sqlite3_step(stmt) != SQLITE_DONE)) { @@ -4265,12 +4551,6 @@ static int vec0_init(sqlite3 *db, void *pAux, int argc, const char *const *argv, } sqlite3_finalize(stmt); } - - rc = vec0_new_chunk(pNew, NULL); - if (rc != SQLITE_OK) { - *pzErr = sqlite3_mprintf("Could not create create 
an initial chunk"); - goto error; - } } *ppVtab = (sqlite3_vtab *)pNew; @@ -4372,9 +4652,30 @@ static int vec0Close(sqlite3_vtab_cursor *cur) { return SQLITE_OK; } -#define VEC0_QUERY_PLAN_FULLSCAN "fullscan" -#define VEC0_QUERY_PLAN_POINT "point" -#define VEC0_QUERY_PLAN_KNN "knn" +// All the different type of "values" provided to argv/argc in vec0Filter. +// These enums denote the use and purpose of all of them. +typedef enum { + // If any values are updated, please update the ARCHITECTURE.md docs accordingly! + + VEC0_IDXSTR_KIND_KNN_MATCH = '{', + VEC0_IDXSTR_KIND_KNN_K = '}', + VEC0_IDXSTR_KIND_KNN_ROWID_IN = '[', + VEC0_IDXSTR_KIND_KNN_PARTITON_CONSTRAINT = ']', + VEC0_IDXSTR_KIND_POINT_ID = '!', +} vec0_idxstr_kind; + +// The different SQLITE_INDEX_CONSTRAINT values that vec0 partition key columns +// support, but as characters that fit nicely in idxstr. +typedef enum { + // If any values are updated, please update the ARCHITECTURE.md docs accordingly! + + VEC0_PARTITION_OPERATOR_EQ = 'a', + VEC0_PARTITION_OPERATOR_GT = 'b', + VEC0_PARTITION_OPERATOR_LE = 'c', + VEC0_PARTITION_OPERATOR_LT = 'd', + VEC0_PARTITION_OPERATOR_GE = 'e', + VEC0_PARTITION_OPERATOR_NE = 'f', +} vec0_partition_operator; static int vec0BestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pIdxInfo) { vec0_vtab *p = (vec0_vtab *)pVTab; @@ -4420,6 +4721,10 @@ static int vec0BestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pIdxInfo) { int iColumn = pIdxInfo->aConstraint[i].iColumn; int op = pIdxInfo->aConstraint[i].op; + + if (op == SQLITE_INDEX_CONSTRAINT_LIMIT) { + iLimitTerm = i; + } if (op == SQLITE_INDEX_CONSTRAINT_MATCH && vec0_column_idx_is_vector(p, iColumn)) { if (iMatchTerm > -1) { @@ -4430,9 +4735,6 @@ static int vec0BestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pIdxInfo) { iMatchTerm = i; iMatchVectorTerm = vec0_column_idx_to_vector_idx(p, iColumn); } - if (op == SQLITE_INDEX_CONSTRAINT_LIMIT) { - iLimitTerm = i; - } if (op == SQLITE_INDEX_CONSTRAINT_EQ && iColumn == VEC0_COLUMN_ID) { if (vtabIn) { if (iRowidInTerm != -1) { @@ -4450,88 +4752,167 @@ static int vec0BestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pIdxInfo) { iKTerm = i; } } + + sqlite3_str *idxStr = sqlite3_str_new(NULL); + int rc; + if (iMatchTerm >= 0) { if (iLimitTerm < 0 && iKTerm < 0) { vtab_set_error( pVTab, "A LIMIT or 'k = ?' constraint is required on vec0 knn queries."); - return SQLITE_ERROR; + rc = SQLITE_ERROR; + goto done; } if (iLimitTerm >= 0 && iKTerm >= 0) { vtab_set_error(pVTab, "Only LIMIT or 'k =?' 
can be provided, not both"); - return SQLITE_ERROR; + rc = SQLITE_ERROR; + goto done; } if (pIdxInfo->nOrderBy) { if (pIdxInfo->nOrderBy > 1) { vtab_set_error(pVTab, "Only a single 'ORDER BY distance' clause is " "allowed on vec0 KNN queries"); - return SQLITE_ERROR; + rc = SQLITE_ERROR; + goto done; } if (pIdxInfo->aOrderBy[0].iColumn != vec0_column_distance_idx(p)) { vtab_set_error(pVTab, "Only a single 'ORDER BY distance' clause is allowed on " "vec0 KNN queries, not on other columns"); - return SQLITE_ERROR; + rc = SQLITE_ERROR; + goto done; } if (pIdxInfo->aOrderBy[0].desc) { vtab_set_error( pVTab, "Only ascending in ORDER BY distance clause is supported, " "DESC is not supported yet."); - return SQLITE_ERROR; + rc = SQLITE_ERROR; + goto done; } } - pIdxInfo->aConstraintUsage[iMatchTerm].argvIndex = 1; + sqlite3_str_appendchar(idxStr, 1, VEC0_QUERY_PLAN_KNN); + + int argvIndex = 1; + pIdxInfo->aConstraintUsage[iMatchTerm].argvIndex = argvIndex++; pIdxInfo->aConstraintUsage[iMatchTerm].omit = 1; + sqlite3_str_appendchar(idxStr, 1, VEC0_IDXSTR_KIND_KNN_MATCH); + sqlite3_str_appendchar(idxStr, 3, '_'); if (iLimitTerm >= 0) { - pIdxInfo->aConstraintUsage[iLimitTerm].argvIndex = 2; + pIdxInfo->aConstraintUsage[iLimitTerm].argvIndex = argvIndex++; pIdxInfo->aConstraintUsage[iLimitTerm].omit = 1; } else { - pIdxInfo->aConstraintUsage[iKTerm].argvIndex = 2; + pIdxInfo->aConstraintUsage[iKTerm].argvIndex = argvIndex++; pIdxInfo->aConstraintUsage[iKTerm].omit = 1; } - - sqlite3_str *idxStr = sqlite3_str_new(NULL); - sqlite3_str_appendall(idxStr, "knn:"); -#define VEC0_IDX_KNN_ROWID_IN 'I' + sqlite3_str_appendchar(idxStr, 1, VEC0_IDXSTR_KIND_KNN_K); + sqlite3_str_appendchar(idxStr, 3, '_'); #if COMPILER_SUPPORTS_VTAB_IN if (iRowidInTerm >= 0) { // already validated as >= SQLite 3.38 bc iRowidInTerm is only >= 0 when // vtabIn == 1 sqlite3_vtab_in(pIdxInfo, iRowidInTerm, 1); - sqlite3_str_appendchar(idxStr, VEC0_IDX_KNN_ROWID_IN, 1); - pIdxInfo->aConstraintUsage[iRowidInTerm].argvIndex = 3; + pIdxInfo->aConstraintUsage[iRowidInTerm].argvIndex = argvIndex++; pIdxInfo->aConstraintUsage[iRowidInTerm].omit = 1; + sqlite3_str_appendchar(idxStr, 1, VEC0_IDXSTR_KIND_KNN_ROWID_IN); + sqlite3_str_appendchar(idxStr, 3, '_'); } #endif - pIdxInfo->idxNum = iMatchVectorTerm; - pIdxInfo->idxStr = sqlite3_str_finish(idxStr); - if (!pIdxInfo->idxStr) { - return SQLITE_NOMEM; + for (int i = 0; i < pIdxInfo->nConstraint; i++) { + if (!pIdxInfo->aConstraint[i].usable) + continue; + + int iColumn = pIdxInfo->aConstraint[i].iColumn; + int op = pIdxInfo->aConstraint[i].op; + if(op == SQLITE_INDEX_CONSTRAINT_LIMIT || op == SQLITE_INDEX_CONSTRAINT_OFFSET) { + continue; + } + if(!vec0_column_idx_is_partition(p, iColumn)) { + continue; + } + + int partition_idx = vec0_column_idx_to_partition_idx(p, iColumn); + char value = 0; + + switch(op) { + case SQLITE_INDEX_CONSTRAINT_EQ: { + value = VEC0_PARTITION_OPERATOR_EQ; + break; + } + case SQLITE_INDEX_CONSTRAINT_GT: { + value = VEC0_PARTITION_OPERATOR_GT; + break; + } + case SQLITE_INDEX_CONSTRAINT_LE: { + value = VEC0_PARTITION_OPERATOR_LE; + break; + } + case SQLITE_INDEX_CONSTRAINT_LT: { + value = VEC0_PARTITION_OPERATOR_LT; + break; + } + case SQLITE_INDEX_CONSTRAINT_GE: { + value = VEC0_PARTITION_OPERATOR_GE; + break; + } + case SQLITE_INDEX_CONSTRAINT_NE: { + value = VEC0_PARTITION_OPERATOR_NE; + break; + } + } + + if(value) { + pIdxInfo->aConstraintUsage[i].argvIndex = argvIndex++; + pIdxInfo->aConstraintUsage[i].omit = 1; + sqlite3_str_appendchar(idxStr, 1, 
VEC0_IDXSTR_KIND_KNN_PARTITON_CONSTRAINT); + sqlite3_str_appendchar(idxStr, 1, 'A' + partition_idx); + sqlite3_str_appendchar(idxStr, 1, value); + sqlite3_str_appendchar(idxStr, 1, '_'); + } + } - pIdxInfo->needToFreeIdxStr = 1; + + + + pIdxInfo->idxNum = iMatchVectorTerm; pIdxInfo->estimatedCost = 30.0; pIdxInfo->estimatedRows = 10; } else if (iRowidTerm >= 0) { + sqlite3_str_appendchar(idxStr, 1, VEC0_QUERY_PLAN_POINT); pIdxInfo->aConstraintUsage[iRowidTerm].argvIndex = 1; pIdxInfo->aConstraintUsage[iRowidTerm].omit = 1; + sqlite3_str_appendchar(idxStr, 1, VEC0_IDXSTR_KIND_POINT_ID); + sqlite3_str_appendchar(idxStr, 3, '_'); pIdxInfo->idxNum = pIdxInfo->colUsed; - pIdxInfo->idxStr = VEC0_QUERY_PLAN_POINT; - pIdxInfo->needToFreeIdxStr = 0; pIdxInfo->estimatedCost = 10.0; pIdxInfo->estimatedRows = 1; } else { - pIdxInfo->idxStr = VEC0_QUERY_PLAN_FULLSCAN; + sqlite3_str_appendchar(idxStr, 1, VEC0_QUERY_PLAN_FULLSCAN); pIdxInfo->estimatedCost = 3000000.0; pIdxInfo->estimatedRows = 100000; } + pIdxInfo->idxStr = sqlite3_str_finish(idxStr); + idxStr = NULL; + if (!pIdxInfo->idxStr) { + rc = SQLITE_OK; + goto done; + } + pIdxInfo->needToFreeIdxStr = 1; - return SQLITE_OK; + + rc = SQLITE_OK; + + done: + if(idxStr) { + sqlite3_str_finish(idxStr); + } + return rc; } // forward delcaration bc vec0Filter uses it @@ -4665,6 +5046,103 @@ int min_idx(const f32 *distances, i32 n, u8 *candidates, i32 *out, i32 k, return SQLITE_OK; } +/** + * @brief Crete at "iterator" (sqlite3_stmt) of chunks with the given constraints + * + * Any VEC0_IDXSTR_KIND_KNN_PARTITON_CONSTRAINT values in idxStr/argv will be applied + * as WHERE constraints in the underlying stmt SQL, and any consumer of the stmt + * can freely step through the stmt with all constraints satisfied. + * + * @param p - vec0_vtab + * @param idxStr - the xBestIndex/xFilter idxstr containing VEC0_IDXSTR values + * @param argc - number of argv values from xFilter + * @param argv - array of sqlite3_value from xFilter + * @param outStmt - output sqlite3_stmt of chunks with all filters applied + * @return int SQLITE_OK on success, error code otherwise + */ +int vec0_chunks_iter(vec0_vtab * p, const char * idxStr, int argc, sqlite3_value ** argv, sqlite3_stmt** outStmt) { + // always null terminated, enforced by SQLite + int idxStrLength = strlen(idxStr); + // "1" refers to the initial vec0_query_plan char, 4 is the number of chars per "element" + int numValueEntries = (idxStrLength-1) / 4; + + int rc; + sqlite3_str * s = sqlite3_str_new(NULL); + sqlite3_str_appendf(s, "select chunk_id, validity, rowids " + " from " VEC0_SHADOW_CHUNKS_NAME, + p->schemaName, p->tableName); + + int appendedWhere = 0; + for(int i = 0; i < numValueEntries; i++) { + int idx = 1 + (i * 4); + char kind = idxStr[idx + 0]; + if(kind != VEC0_IDXSTR_KIND_KNN_PARTITON_CONSTRAINT) { + continue; + } + + int partition_idx = idxStr[idx + 1] - 'A'; + int operator = idxStr[idx + 2]; + // idxStr[idx + 3] is just null, a '_' placeholder + + if(!appendedWhere) { + sqlite3_str_appendall(s, " WHERE "); + appendedWhere = 1; + }else { + sqlite3_str_appendall(s, " AND "); + } + switch(operator) { + case VEC0_PARTITION_OPERATOR_EQ: + sqlite3_str_appendf(s, " partition%02d = ? ", partition_idx); + break; + case VEC0_PARTITION_OPERATOR_GT: + sqlite3_str_appendf(s, " partition%02d > ? ", partition_idx); + break; + case VEC0_PARTITION_OPERATOR_LE: + sqlite3_str_appendf(s, " partition%02d <= ? ", partition_idx); + break; + case VEC0_PARTITION_OPERATOR_LT: + sqlite3_str_appendf(s, " partition%02d < ? 
", partition_idx); + break; + case VEC0_PARTITION_OPERATOR_GE: + sqlite3_str_appendf(s, " partition%02d >= ? ", partition_idx); + break; + case VEC0_PARTITION_OPERATOR_NE: + sqlite3_str_appendf(s, " partition%02d != ? ", partition_idx); + break; + default: { + char * zSql = sqlite3_str_finish(s); + sqlite3_free(zSql); + return SQLITE_ERROR; + } + + } + + } + + char *zSql = sqlite3_str_finish(s); + if (!zSql) { + return SQLITE_NOMEM; + } + + rc = sqlite3_prepare_v2(p->db, zSql, -1, outStmt, NULL); + sqlite3_free(zSql); + if(rc != SQLITE_OK) { + return rc; + } + + int n = 1; + for(int i = 0; i < numValueEntries; i++) { + int idx = 1 + (i * 4); + char kind = idxStr[idx + 0]; + if(kind != VEC0_IDXSTR_KIND_KNN_PARTITON_CONSTRAINT) { + continue; + } + sqlite3_bind_value(*outStmt, n++, argv[i]); + } + + return rc; +} + int vec0Filter_knn_chunks_iter(vec0_vtab *p, sqlite3_stmt *stmtChunks, struct VectorColumnDefinition *vector_column, int vectorColumnIdx, struct Array *arrayRowidsIn, @@ -4960,8 +5438,7 @@ int vec0Filter_knn_chunks_iter(vec0_vtab *p, sqlite3_stmt *stmtChunks, int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, const char *idxStr, int argc, sqlite3_value **argv) { - UNUSED_PARAMETER(idxStr); - assert(argc >= 2); + assert(argc == (strlen(idxStr)-1) / 4); int rc; struct vec0_query_knn_data *knn_data; @@ -4982,7 +5459,25 @@ int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, } memset(knn_data, 0, sizeof(*knn_data)); - rc = vector_from_value(argv[0], &queryVector, &dimensions, &elementType, + int query_idx =-1; + int k_idx = -1; + int rowid_in_idx = -1; + for(int i = 0; i < argc; i++) { + if(idxStr[1 + (i*4)] == VEC0_IDXSTR_KIND_KNN_MATCH) { + query_idx = i; + } + if(idxStr[1 + (i*4)] == VEC0_IDXSTR_KIND_KNN_K) { + k_idx = i; + } + if(idxStr[1 + (i*4)] == VEC0_IDXSTR_KIND_KNN_ROWID_IN) { + rowid_in_idx = i; + } + } + assert(query_idx >= 0); + assert(k_idx >= 0); + + // make sure the query vector matches the vector column (type dimensions etc.) + rc = vector_from_value(argv[query_idx], &queryVector, &dimensions, &elementType, &queryVectorCleanup, &pzError); if (rc != SQLITE_OK) { @@ -5014,7 +5509,7 @@ int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, goto cleanup; } - i64 k = sqlite3_value_int64(argv[1]); + i64 k = sqlite3_value_int64(argv[k_idx]); if (k < 0) { vtab_set_error( &p->base, "k value in knn queries must be greater than or equal to 0."); @@ -5034,7 +5529,7 @@ int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, if (k == 0) { knn_data->k = 0; pCur->knn_data = knn_data; - pCur->query_plan = SQLITE_VEC0_QUERYPLAN_KNN; + pCur->query_plan = VEC0_QUERY_PLAN_KNN; rc = SQLITE_OK; goto cleanup; } @@ -5043,7 +5538,7 @@ int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, // Array of all the rowids that appear in any `rowid in (...)` constraint. // NULL if none were provided, which means a "full" scan. 
#if COMPILER_SUPPORTS_VTAB_IN - if (argc > 2) { + if (rowid_in_idx >= 0) { sqlite3_value *item; int rc; arrayRowidsIn = sqlite3_malloc(sizeof(*arrayRowidsIn)); @@ -5057,8 +5552,8 @@ int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, if (rc != SQLITE_OK) { goto cleanup; } - for (rc = sqlite3_vtab_in_first(argv[2], &item); rc == SQLITE_OK && item; - rc = sqlite3_vtab_in_next(argv[2], &item)) { + for (rc = sqlite3_vtab_in_first(argv[rowid_in_idx], &item); rc == SQLITE_OK && item; + rc = sqlite3_vtab_in_next(argv[rowid_in_idx], &item)) { i64 rowid; if (p->pkIsText) { rc = vec0_rowid_from_id(p, item, &rowid); @@ -5082,16 +5577,7 @@ int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, } #endif - char *zSql; - zSql = sqlite3_mprintf("select chunk_id, validity, rowids " - " from " VEC0_SHADOW_CHUNKS_NAME, - p->schemaName, p->tableName); - if (!zSql) { - rc = SQLITE_NOMEM; - goto cleanup; - } - rc = sqlite3_prepare_v2(p->db, zSql, -1, &stmtChunks, NULL); - sqlite3_free(zSql); + rc = vec0_chunks_iter(p, idxStr, argc, argv, &stmtChunks); if (rc != SQLITE_OK) { // IMP: V06942_23781 vtab_set_error(&p->base, "Error preparing stmtChunk: %s", @@ -5116,7 +5602,7 @@ int vec0Filter_knn(vec0_cursor *pCur, vec0_vtab *p, int idxNum, knn_data->k_used = k_used; pCur->knn_data = knn_data; - pCur->query_plan = SQLITE_VEC0_QUERYPLAN_KNN; + pCur->query_plan = VEC0_QUERY_PLAN_KNN; rc = SQLITE_OK; cleanup: @@ -5164,7 +5650,7 @@ int vec0Filter_fullscan(vec0_vtab *p, vec0_cursor *pCur) { } fullscan_data->done = rc == SQLITE_DONE; - pCur->query_plan = SQLITE_VEC0_QUERYPLAN_FULLSCAN; + pCur->query_plan = VEC0_QUERY_PLAN_FULLSCAN; pCur->fullscan_data = fullscan_data; return SQLITE_OK; @@ -5213,14 +5699,14 @@ int vec0Filter_point(vec0_cursor *pCur, vec0_vtab *p, int argc, point_data->rowid = rowid; point_data->done = 0; pCur->point_data = point_data; - pCur->query_plan = SQLITE_VEC0_QUERYPLAN_POINT; + pCur->query_plan = VEC0_QUERY_PLAN_POINT; return SQLITE_OK; eof: point_data->rowid = rowid; point_data->done = 1; pCur->point_data = point_data; - pCur->query_plan = SQLITE_VEC0_QUERYPLAN_POINT; + pCur->query_plan = VEC0_QUERY_PLAN_POINT; return SQLITE_OK; error: @@ -5234,30 +5720,45 @@ static int vec0Filter(sqlite3_vtab_cursor *pVtabCursor, int idxNum, vec0_vtab *p = (vec0_vtab *)pVtabCursor->pVtab; vec0_cursor *pCur = (vec0_cursor *)pVtabCursor; vec0_cursor_clear(pCur); - if (strcmp(idxStr, VEC0_QUERY_PLAN_FULLSCAN) == 0) { - return vec0Filter_fullscan(p, pCur); - } else if (strncmp(idxStr, "knn:", 4) == 0) { - return vec0Filter_knn(pCur, p, idxNum, idxStr, argc, argv); - } else if (strcmp(idxStr, VEC0_QUERY_PLAN_POINT) == 0) { - return vec0Filter_point(pCur, p, argc, argv); - } else { - vtab_set_error(pVtabCursor->pVtab, "unknown idxStr '%s'", idxStr); + + int idxStrLength = strlen(idxStr); + if(idxStrLength <= 0) { + return SQLITE_ERROR; + } + if((idxStrLength-1) % 4 != 0) { return SQLITE_ERROR; } + int numValueEntries = (idxStrLength-1) / 4; + if(numValueEntries != argc) { + return SQLITE_ERROR; + } + + char query_plan = idxStr[0]; + switch(query_plan) { + case VEC0_QUERY_PLAN_FULLSCAN: + return vec0Filter_fullscan(p, pCur); + case VEC0_QUERY_PLAN_KNN: + return vec0Filter_knn(pCur, p, idxNum, idxStr, argc, argv); + case VEC0_QUERY_PLAN_POINT: + return vec0Filter_point(pCur, p, argc, argv); + default: + vtab_set_error(pVtabCursor->pVtab, "unknown idxStr '%s'", idxStr); + return SQLITE_ERROR; + } } static int vec0Rowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid) { vec0_cursor *pCur = (vec0_cursor 
*)cur; switch (pCur->query_plan) { - case SQLITE_VEC0_QUERYPLAN_FULLSCAN: { + case VEC0_QUERY_PLAN_FULLSCAN: { *pRowid = sqlite3_column_int64(pCur->fullscan_data->rowids_stmt, 0); return SQLITE_OK; } - case SQLITE_VEC0_QUERYPLAN_POINT: { + case VEC0_QUERY_PLAN_POINT: { *pRowid = pCur->point_data->rowid; return SQLITE_OK; } - case SQLITE_VEC0_QUERYPLAN_KNN: { + case VEC0_QUERY_PLAN_KNN: { vtab_set_error(cur->pVtab, "Internal sqlite-vec error: expected point query plan in " "vec0Rowid, found %d", @@ -5271,7 +5772,7 @@ static int vec0Rowid(sqlite3_vtab_cursor *cur, sqlite_int64 *pRowid) { static int vec0Next(sqlite3_vtab_cursor *cur) { vec0_cursor *pCur = (vec0_cursor *)cur; switch (pCur->query_plan) { - case SQLITE_VEC0_QUERYPLAN_FULLSCAN: { + case VEC0_QUERY_PLAN_FULLSCAN: { if (!pCur->fullscan_data) { return SQLITE_ERROR; } @@ -5285,7 +5786,7 @@ static int vec0Next(sqlite3_vtab_cursor *cur) { } return SQLITE_ERROR; } - case SQLITE_VEC0_QUERYPLAN_KNN: { + case VEC0_QUERY_PLAN_KNN: { if (!pCur->knn_data) { return SQLITE_ERROR; } @@ -5293,7 +5794,7 @@ static int vec0Next(sqlite3_vtab_cursor *cur) { pCur->knn_data->current_idx++; return SQLITE_OK; } - case SQLITE_VEC0_QUERYPLAN_POINT: { + case VEC0_QUERY_PLAN_POINT: { if (!pCur->point_data) { return SQLITE_ERROR; } @@ -5307,13 +5808,13 @@ static int vec0Next(sqlite3_vtab_cursor *cur) { static int vec0Eof(sqlite3_vtab_cursor *cur) { vec0_cursor *pCur = (vec0_cursor *)cur; switch (pCur->query_plan) { - case SQLITE_VEC0_QUERYPLAN_FULLSCAN: { + case VEC0_QUERY_PLAN_FULLSCAN: { if (!pCur->fullscan_data) { return 1; } return pCur->fullscan_data->done; } - case SQLITE_VEC0_QUERYPLAN_KNN: { + case VEC0_QUERY_PLAN_KNN: { if (!pCur->knn_data) { return 1; } @@ -5321,7 +5822,7 @@ static int vec0Eof(sqlite3_vtab_cursor *cur) { // (pCur->knn_data->distances[pCur->knn_data->current_idx] == FLT_MAX); return (pCur->knn_data->current_idx >= pCur->knn_data->k_used); } - case SQLITE_VEC0_QUERYPLAN_POINT: { + case VEC0_QUERY_PLAN_POINT: { if (!pCur->point_data) { return 1; } @@ -5341,7 +5842,8 @@ static int vec0Column_fullscan(vec0_vtab *pVtab, vec0_cursor *pCur, i64 rowid = sqlite3_column_int64(pCur->fullscan_data->rowids_stmt, 0); if (i == VEC0_COLUMN_ID) { return vec0_result_id(pVtab, context, rowid); - } else if (vec0_column_idx_is_vector(pVtab, i)) { + } + else if (vec0_column_idx_is_vector(pVtab, i)) { void *v; int sz; int vector_idx = vec0_column_idx_to_vector_idx(pVtab, i); @@ -5353,11 +5855,20 @@ static int vec0Column_fullscan(vec0_vtab *pVtab, vec0_cursor *pCur, sqlite3_result_subtype(context, pVtab->vector_columns[vector_idx].element_type); - } else if (i == vec0_column_distance_idx(pVtab)) { - sqlite3_result_null(context); - } else { + } + else if (i == vec0_column_distance_idx(pVtab)) { sqlite3_result_null(context); } + else if(vec0_column_idx_is_partition(pVtab, i)) { + int partition_idx = vec0_column_idx_to_partition_idx(pVtab, i); + sqlite3_value * v; + int rc = vec0_get_partition_value_for_rowid(pVtab, rowid, partition_idx, &v); + if(rc == SQLITE_OK) { + sqlite3_result_value(context, v); + }else { + sqlite3_result_error_code(context, rc); + } + } return SQLITE_OK; } @@ -5371,11 +5882,11 @@ static int vec0Column_point(vec0_vtab *pVtab, vec0_cursor *pCur, if (i == VEC0_COLUMN_ID) { return vec0_result_id(pVtab, context, pCur->point_data->rowid); } - if (i == vec0_column_distance_idx(pVtab)) { + else if (i == vec0_column_distance_idx(pVtab)) { sqlite3_result_null(context); return SQLITE_OK; } - if (vec0_column_idx_is_vector(pVtab, i)) { + else if 
(vec0_column_idx_is_vector(pVtab, i)) { if (sqlite3_vtab_nochange(context)) { sqlite3_result_null(context); return SQLITE_OK; @@ -5389,6 +5900,20 @@ static int vec0Column_point(vec0_vtab *pVtab, vec0_cursor *pCur, pVtab->vector_columns[vector_idx].element_type); return SQLITE_OK; } + else if(vec0_column_idx_is_partition(pVtab, i)) { + if(sqlite3_vtab_nochange(context)) { + return SQLITE_OK; + } + int partition_idx = vec0_column_idx_to_partition_idx(pVtab, i); + i64 rowid = pCur->point_data->rowid; + sqlite3_value * v; + int rc = vec0_get_partition_value_for_rowid(pVtab, rowid, partition_idx, &v); + if(rc == SQLITE_OK) { + sqlite3_result_value(context, v); + }else { + sqlite3_result_error_code(context, rc); + } + } return SQLITE_OK; } @@ -5404,12 +5929,12 @@ static int vec0Column_knn(vec0_vtab *pVtab, vec0_cursor *pCur, i64 rowid = pCur->knn_data->rowids[pCur->knn_data->current_idx]; return vec0_result_id(pVtab, context, rowid); } - if (i == vec0_column_distance_idx(pVtab)) { + else if (i == vec0_column_distance_idx(pVtab)) { sqlite3_result_double( context, pCur->knn_data->distances[pCur->knn_data->current_idx]); return SQLITE_OK; } - if (vec0_column_idx_is_vector(pVtab, i)) { + else if (vec0_column_idx_is_vector(pVtab, i)) { void *out; int sz; int vector_idx = vec0_column_idx_to_vector_idx(pVtab, i); @@ -5424,6 +5949,17 @@ static int vec0Column_knn(vec0_vtab *pVtab, vec0_cursor *pCur, pVtab->vector_columns[vector_idx].element_type); return SQLITE_OK; } + else if(vec0_column_idx_is_partition(pVtab, i)) { + int partition_idx = vec0_column_idx_to_partition_idx(pVtab, i); + i64 rowid = pCur->knn_data->rowids[pCur->knn_data->current_idx]; + sqlite3_value * v; + int rc = vec0_get_partition_value_for_rowid(pVtab, rowid, partition_idx, &v); + if(rc == SQLITE_OK) { + sqlite3_result_value(context, v); + }else { + sqlite3_result_error_code(context, rc); + } + } return SQLITE_OK; } @@ -5433,13 +5969,13 @@ static int vec0Column(sqlite3_vtab_cursor *cur, sqlite3_context *context, vec0_cursor *pCur = (vec0_cursor *)cur; vec0_vtab *pVtab = (vec0_vtab *)cur->pVtab; switch (pCur->query_plan) { - case SQLITE_VEC0_QUERYPLAN_FULLSCAN: { + case VEC0_QUERY_PLAN_FULLSCAN: { return vec0Column_fullscan(pVtab, pCur, context, i); } - case SQLITE_VEC0_QUERYPLAN_KNN: { + case VEC0_QUERY_PLAN_KNN: { return vec0Column_knn(pVtab, pCur, context, i); } - case SQLITE_VEC0_QUERYPLAN_POINT: { + case VEC0_QUERY_PLAN_POINT: { return vec0Column_point(pVtab, pCur, context, i); } } @@ -5516,6 +6052,8 @@ int vec0Update_InsertRowidStep(vec0_vtab *p, sqlite3_value *idValue, * no more space in previous chunks. * * @param p: virtual table + * @param partitionKeyValues: array of partition key column values, to constrain + * against any partition key columns. * @param chunk_rowid: Output rowid of the chunk in the _chunks virtual table * that has the avialabiity. 
* @param chunk_offset: Output the index of the available space insert the @@ -5527,7 +6065,9 @@ int vec0Update_InsertRowidStep(vec0_vtab *p, sqlite3_value *idValue, * @return int SQLITE_OK on success, error code on failure */ int vec0Update_InsertNextAvailableStep( - vec0_vtab *p, i64 *chunk_rowid, i64 *chunk_offset, + vec0_vtab *p, + sqlite3_value ** partitionKeyValues, + i64 *chunk_rowid, i64 *chunk_offset, sqlite3_blob **blobChunksValidity, const unsigned char **bufferChunksValidity) { @@ -5535,7 +6075,10 @@ int vec0Update_InsertNextAvailableStep( i64 validitySize; *chunk_offset = -1; - rc = vec0_get_latest_chunk_rowid(p, chunk_rowid); + rc = vec0_get_latest_chunk_rowid(p, chunk_rowid, partitionKeyValues); + if(rc == SQLITE_EMPTY) { + goto done; + } if (rc != SQLITE_OK) { goto cleanup; } @@ -5598,7 +6141,7 @@ int vec0Update_InsertNextAvailableStep( done: // latest chunk was full, so need to create a new one if (*chunk_offset == -1) { - rc = vec0_new_chunk(p, chunk_rowid); + rc = vec0_new_chunk(p, partitionKeyValues, chunk_rowid); if (rc != SQLITE_OK) { // IMP: V08441_25279 vtab_set_error(&p->base, @@ -5852,6 +6395,8 @@ int vec0Update_Insert(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv, // Array to hold cleanup functions for vectorDatas[] vector_cleanup cleanups[VEC0_MAX_VECTOR_COLUMNS]; + sqlite3_value * partitionKeyValues[VEC0_MAX_PARTITION_COLUMNS]; + // Rowid of the chunk in the _chunks shadow table that the row will be a part // of. i64 chunk_rowid; @@ -5865,26 +6410,54 @@ int vec0Update_Insert(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv, const unsigned char *bufferChunksValidity = NULL; int numReadVectors = 0; + // Read all provided partition key values into partitionKeyValues + for (int i = 0; i < vec0_num_defined_user_columns(p); i++) { + if(p->user_column_kinds[i] != SQLITE_VEC0_USER_COLUMN_KIND_PARTITION) { + continue; + } + int partition_key_idx = p->user_column_idxs[i]; + partitionKeyValues[partition_key_idx] = argv[2+VEC0_COLUMN_USERN_START + i]; + + int new_value_type = sqlite3_value_type(partitionKeyValues[partition_key_idx]); + if((new_value_type != SQLITE_NULL) && (new_value_type != p->paritition_columns[partition_key_idx].type)) { + // IMP: V11454_28292 + vtab_set_error( + pVTab, + "Parition key type mismatch: The partition key column %.*s has type %s, but %s was provided.", + p->paritition_columns[partition_key_idx].name_length, + p->paritition_columns[partition_key_idx].name, + type_name(p->paritition_columns[partition_key_idx].type), + type_name(new_value_type) + ); + rc = SQLITE_ERROR; + goto cleanup; + } + } + // read all the inserted vectors into vectorDatas, validate their lengths. 
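+  // Note: every user column is visited here; non-vector kinds are skipped, and
+  // user_column_idxs maps a column's position to its index in vector_columns[].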
-  for (int i = 0; i < p->numVectorColumns; i++) {
-    sqlite3_value *valueVector = argv[2 + VEC0_COLUMN_VECTORN_START + i];
+  for (int i = 0; i < vec0_num_defined_user_columns(p); i++) {
+    if(p->user_column_kinds[i] != SQLITE_VEC0_USER_COLUMN_KIND_VECTOR) {
+      continue;
+    }
+    int vector_column_idx = p->user_column_idxs[i];
+    sqlite3_value *valueVector = argv[2 + VEC0_COLUMN_USERN_START + i];
     size_t dimensions;
     char *pzError;
     enum VectorElementType elementType;
-    rc = vector_from_value(valueVector, &vectorDatas[i], &dimensions,
-                           &elementType, &cleanups[i], &pzError);
+    rc = vector_from_value(valueVector, &vectorDatas[vector_column_idx], &dimensions,
+                           &elementType, &cleanups[vector_column_idx], &pzError);
     if (rc != SQLITE_OK) {
       // IMP: V06519_23358
       vtab_set_error(
           pVTab, "Inserted vector for the \"%.*s\" column is invalid: %z",
-          p->vector_columns[i].name_length, p->vector_columns[i].name, pzError);
+          p->vector_columns[vector_column_idx].name_length, p->vector_columns[vector_column_idx].name, pzError);
       rc = SQLITE_ERROR;
       goto cleanup;
     }
     numReadVectors++;
-    if (elementType != p->vector_columns[i].element_type) {
+    if (elementType != p->vector_columns[vector_column_idx].element_type) {
       // IMP: V08221_25059
       vtab_set_error(
           pVTab,
@@ -5897,14 +6470,14 @@ int vec0Update_Insert(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv,
       goto cleanup;
     }
 
-    if (dimensions != p->vector_columns[i].dimensions) {
+    if (dimensions != p->vector_columns[vector_column_idx].dimensions) {
       // IMP: V01145_17984
       vtab_set_error(
           pVTab,
           "Dimension mismatch for inserted vector for the \"%.*s\" column. "
          "Expected %d dimensions but received %d.",
-          p->vector_columns[i].name_length, p->vector_columns[i].name,
-          p->vector_columns[i].dimensions, dimensions);
+          p->vector_columns[vector_column_idx].name_length, p->vector_columns[vector_column_idx].name,
+          p->vector_columns[vector_column_idx].dimensions, dimensions);
       rc = SQLITE_ERROR;
       goto cleanup;
     }
@@ -5935,7 +6508,8 @@ int vec0Update_Insert(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv,
 
   // Step #2: Find the next "available" position in the _chunks table for this
   // row.
-  rc = vec0Update_InsertNextAvailableStep(p, &chunk_rowid, &chunk_offset,
+  rc = vec0Update_InsertNextAvailableStep(p, partitionKeyValues,
+                                          &chunk_rowid, &chunk_offset,
                                           &blobChunksValidity,
                                           &bufferChunksValidity);
   if (rc != SQLITE_OK) {
@@ -6212,16 +6786,33 @@ int vec0Update_Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv) {
     rowid = sqlite3_value_int64(argv[0]);
   }
 
-  // 1. get chunk_id and chunk_offset from _rowids
+  // 1) get chunk_id and chunk_offset from _rowids
   rc = vec0_get_chunk_position(p, rowid, NULL, &chunk_id, &chunk_offset);
   if (rc != SQLITE_OK) {
     return rc;
   }
 
-  // 2) iterate over all new vectors, update the vectors
+  // 2) update any partition key values
+  for (int i = 0; i < vec0_num_defined_user_columns(p); i++) {
+    if(p->user_column_kinds[i] != SQLITE_VEC0_USER_COLUMN_KIND_PARTITION) {
+      continue;
+    }
+    int partition_key_idx = p->user_column_idxs[i];
+    sqlite3_value * value = argv[2+VEC0_COLUMN_USERN_START + i];
+    if(sqlite3_value_nochange(value)) {
+      continue;
+    }
+    vtab_set_error(pVTab, "UPDATEs on partition key columns are not supported yet. 
"); + return SQLITE_ERROR; + } - for (int i = 0; i < p->numVectorColumns; i++) { - sqlite3_value *valueVector = argv[2 + VEC0_COLUMN_VECTORN_START + i]; + // 3) iterate over all new vectors, update the vectors + for (int i = 0; i < vec0_num_defined_user_columns(p); i++) { + if(p->user_column_kinds[i] != SQLITE_VEC0_USER_COLUMN_KIND_VECTOR) { + continue; + } + int vector_idx = p->user_column_idxs[i]; + sqlite3_value *valueVector = argv[2 + VEC0_COLUMN_USERN_START + i]; // in vec0Column, we check sqlite3_vtab_nochange() on vector columns. // If the vector column isn't being changed, we return NULL; // That's not great, that means vector columns can never be NULLABLE @@ -6236,7 +6827,7 @@ int vec0Update_Update(sqlite3_vtab *pVTab, int argc, sqlite3_value **argv) { continue; } - rc = vec0Update_UpdateVectorColumn(p, chunk_id, chunk_offset, i, + rc = vec0Update_UpdateVectorColumn(p, chunk_id, chunk_offset, vector_idx, valueVector); if (rc != SQLITE_OK) { return SQLITE_ERROR; diff --git a/test.sql b/test.sql new file mode 100644 index 0000000..7434207 --- /dev/null +++ b/test.sql @@ -0,0 +1,49 @@ +.load dist/vec0 +.echo on +.bail on + +.mode qbox + +create virtual table v using vec0(a float[1]); +select count(*) from v_chunks; +insert into v(a) values ('[1.11]'); +select * from v; +drop table v; + +create virtual table v using vec0( + + v_aaa float[1], + partk_xxx int partition key, + v_bbb float[2], + partk_yyy text partition key, + chunk_size=32 +); + + +insert into v(rowid, v_aaa, partk_xxx, v_bbb, partk_yyy) values + (1, '[.1]', 999, '[.11, .11]', 'alex'), + (2, '[.2]', 999, '[.22, .22]', 'alex'), + (3, '[.3]', 999, '[.33, .33]', 'brian'); + + +select rowid, vec_to_json(v_aaa), partk_xxx, vec_to_json(v_bbb), partk_yyy from v; + +select * from v; +select * from v where rowid = 2; +update v +set v_aaa = '[.222]', + v_bbb = '[.222, .222]' +where rowid = 2; + +select rowid, vec_to_json(v_aaa), partk_xxx, vec_to_json(v_bbb), partk_yyy from v; + +select chunk_id, size, sequence_id, partition00, partition01, (validity), length(rowids) from v_chunks; + +--explain query plan +select *, distance +from v +where v_aaa match '[.5]' + and partk_xxx = 999 + and partk_yyy = 'alex' + --and partk_xxx != 20 + and k = 5; diff --git a/tests/.python-version b/tests/.python-version new file mode 100644 index 0000000..e4fba21 --- /dev/null +++ b/tests/.python-version @@ -0,0 +1 @@ +3.12 diff --git a/tests/__snapshots__/test-partition-keys.ambr b/tests/__snapshots__/test-partition-keys.ambr new file mode 100644 index 0000000..a9dca88 --- /dev/null +++ b/tests/__snapshots__/test-partition-keys.ambr @@ -0,0 +1,245 @@ +# serializer version: 1 +# name: test_constructor_limit[max 4 partition keys] + dict({ + 'error': 'OperationalError', + 'message': 'vec0 constructor error: More than 4 partition key columns were provided', + }) +# --- +# name: test_normal[1 row] + dict({ + 'v_chunks': OrderedDict({ + 'sql': 'select * from v_chunks', + 'rows': list([ + OrderedDict({ + 'chunk_id': 1, + 'size': 8, + 'sequence_id': None, + 'partition00': 100, + 'validity': b'\x01', + 'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', + }), + ]), + }), + 'v_rowids': OrderedDict({ + 'sql': 'select * from v_rowids', + 'rows': list([ + OrderedDict({ + 'rowid': 1, + 'id': None, + 'chunk_id': 1, + 'chunk_offset': 0, + }), + ]), + 
}),
+    'v_vector_chunks00': OrderedDict({
+      'sql': 'select * from v_vector_chunks00',
+      'rows': list([
+        OrderedDict({
+          'rowid': 1,
+          'vectors': b'\x11"3D\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
+        }),
+      ]),
+    }),
+  })
+# ---
+# name: test_normal[2 rows, same partition]
+  dict({
+    'v_chunks': OrderedDict({
+      'sql': 'select * from v_chunks',
+      'rows': list([
+        OrderedDict({
+          'chunk_id': 1,
+          'size': 8,
+          'sequence_id': None,
+          'partition00': 100,
+          'validity': b'\x03',
+          'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
+        }),
+      ]),
+    }),
+    'v_rowids': OrderedDict({
+      'sql': 'select * from v_rowids',
+      'rows': list([
+        OrderedDict({
+          'rowid': 1,
+          'id': None,
+          'chunk_id': 1,
+          'chunk_offset': 0,
+        }),
+        OrderedDict({
+          'rowid': 2,
+          'id': None,
+          'chunk_id': 1,
+          'chunk_offset': 1,
+        }),
+      ]),
+    }),
+    'v_vector_chunks00': OrderedDict({
+      'sql': 'select * from v_vector_chunks00',
+      'rows': list([
+        OrderedDict({
+          'rowid': 1,
+          'vectors': b'\x11"3DDUfw\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
+        }),
+      ]),
+    }),
+  })
+# ---
+# name: test_normal[3 rows, 2 partitions]
+  dict({
+    'v_chunks': OrderedDict({
+      'sql': 'select * from v_chunks',
+      'rows': list([
+        OrderedDict({
+          'chunk_id': 1,
+          'size': 8,
+          'sequence_id': None,
+          'partition00': 100,
+          'validity': b'\x03',
+          'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
+        }),
+        OrderedDict({
+          'chunk_id': 2,
+          'size': 8,
+          'sequence_id': None,
+          'partition00': 200,
+          'validity': b'\x01',
+          'rowids': b'\x03\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
+        }),
+      ]),
+    }),
+    'v_rowids': OrderedDict({
+      'sql': 'select * from v_rowids',
+      'rows': list([
+        OrderedDict({
+          'rowid': 1,
+          'id': None,
+          'chunk_id': 1,
+          'chunk_offset': 0,
+        }),
+        OrderedDict({
+          'rowid': 2,
+          'id': None,
+          'chunk_id': 1,
+          'chunk_offset': 1,
+        }),
+        OrderedDict({
+          'rowid': 3,
+          'id': None,
+          'chunk_id': 2,
+          'chunk_offset': 0,
+        }),
+      ]),
+    }),
+    'v_vector_chunks00': OrderedDict({
+      'sql': 'select * from v_vector_chunks00',
+      'rows': list([
+        OrderedDict({
+          'rowid': 1,
+          'vectors': b'\x11"3DDUfw\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
+        }),
+        OrderedDict({
+          'rowid': 2,
+          'vectors': b'\x88\x99\xaa\xbb\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
+        }),
+      ]),
+    }),
+  })
+# ---
+# name: test_types[1. raises type error]
+  dict({
+    'error': 'OperationalError',
+    'message': 'Partition key type mismatch: The partition key column p1 has type INTEGER, but TEXT was provided.',
+  })
+# ---
+# name: test_types[2. 
empty DB]
+  dict({
+    'v_chunks': OrderedDict({
+      'sql': 'select * from v_chunks',
+      'rows': list([
+      ]),
+    }),
+    'v_rowids': OrderedDict({
+      'sql': 'select * from v_rowids',
+      'rows': list([
+      ]),
+    }),
+    'v_vector_chunks00': OrderedDict({
+      'sql': 'select * from v_vector_chunks00',
+      'rows': list([
+      ]),
+    }),
+  })
+# ---
+# name: test_types[3. allow nulls]
+  OrderedDict({
+    'sql': 'insert into v(p1, a) values(?, ?)',
+    'rows': list([
+    ]),
+  })
+# ---
+# name: test_types[4. show NULL partition key]
+  dict({
+    'v_chunks': OrderedDict({
+      'sql': 'select * from v_chunks',
+      'rows': list([
+        OrderedDict({
+          'chunk_id': 1,
+          'size': 8,
+          'sequence_id': None,
+          'partition00': None,
+          'validity': b'\x01',
+          'rowids': b'\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
+        }),
+      ]),
+    }),
+    'v_rowids': OrderedDict({
+      'sql': 'select * from v_rowids',
+      'rows': list([
+        OrderedDict({
+          'rowid': 1,
+          'id': None,
+          'chunk_id': 1,
+          'chunk_offset': 0,
+        }),
+      ]),
+    }),
+    'v_vector_chunks00': OrderedDict({
+      'sql': 'select * from v_vector_chunks00',
+      'rows': list([
+        OrderedDict({
+          'rowid': 1,
+          'vectors': b'\x11"3D\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
+        }),
+      ]),
+    }),
+  })
+# ---
+# name: test_updates[1. Initial dataset]
+  OrderedDict({
+    'sql': 'select * from v',
+    'rows': list([
+      OrderedDict({
+        'rowid': 1,
+        'p': 'a',
+        'a': b'\x11\x11\x11\x11',
+      }),
+      OrderedDict({
+        'rowid': 2,
+        'p': 'a',
+        'a': b'""""',
+      }),
+      OrderedDict({
+        'rowid': 3,
+        'p': 'a',
+        'a': b'3333',
+      }),
+    ]),
+  })
+# ---
+# name: test_updates[2. update #1]
+  dict({
+    'error': 'OperationalError',
+    'message': 'UPDATEs on partition key columns are not supported yet. 
', + }) +# --- diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..9549d37 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,12 @@ +import pytest +import sqlite3 + + +@pytest.fixture() +def db(): + db = sqlite3.connect(":memory:") + db.row_factory = sqlite3.Row + db.enable_load_extension(True) + db.load_extension("dist/vec0") + db.enable_load_extension(False) + return db diff --git a/tests/pyproject.toml b/tests/pyproject.toml new file mode 100644 index 0000000..15c42c9 --- /dev/null +++ b/tests/pyproject.toml @@ -0,0 +1,9 @@ +[project] +name = "tests" +version = "0.1.0" +description = "Add your description here" +readme = "README.md" +requires-python = ">=3.12" +dependencies = [ + "pytest", "numpy", "syrupy" +] diff --git a/tests/test-correctness.py b/tests/skip.test-correctness.py similarity index 100% rename from tests/test-correctness.py rename to tests/skip.test-correctness.py diff --git a/tests/test-loadable.py b/tests/test-loadable.py index b1976cb..30171fe 100644 --- a/tests/test-loadable.py +++ b/tests/test-loadable.py @@ -81,7 +81,7 @@ def connect(ext, path=":memory:", extra_entrypoint=None): db = connect(EXT_PATH) -def explain_query_plan(sql): +def explain_query_plan(sql, db=db): return db.execute("explain query plan " + sql).fetchone()["detail"] @@ -1497,6 +1497,13 @@ def test_vec0_text_pk(): ] if SUPPORTS_VTAB_IN: + assert re.match( + ("SCAN (TABLE )?t VIRTUAL TABLE INDEX 0:3{___}___\[___"), + explain_query_plan( + "select t_id, distance from t where aaa match '' and k = 3 and t_id in ('t_2', 't_3')", + db=db, + ), + ) assert execute_all( db, "select t_id, distance from t where aaa match ? and k = 3 and t_id in ('t_2', 't_3')", @@ -1939,20 +1946,6 @@ def test_vec0_create_errors(): db.execute("create virtual table t1 using vec0(a float[1])") db.set_authorizer(None) - db.set_authorizer(authorizer_deny_on(sqlite3.SQLITE_INSERT, "t1_chunks")) - with _raises( - "Could not create create an initial chunk", - ): - db.execute("create virtual table t1 using vec0(a float[1])") - db.set_authorizer(None) - - db.set_authorizer(authorizer_deny_on(sqlite3.SQLITE_INSERT, "t1_vector_chunks00")) - with _raises( - "Could not create create an initial chunk", - ): - db.execute("create virtual table t1 using vec0(a float[1])") - db.set_authorizer(None) - # EVIDENCE-OF: V21406_05476 vec0 init raises error on 'latest chunk' init error db.execute("BEGIN") db.set_authorizer(authorizer_deny_on(sqlite3.SQLITE_READ, "t1_chunks", "")) @@ -2231,32 +2224,34 @@ def test_smoke(): }, ] chunk = db.execute("select * from vec_xyz_chunks").fetchone() - assert chunk["chunk_id"] == 1 - assert chunk["validity"] == bytearray(int(1024 / 8)) - assert chunk["rowids"] == bytearray(int(1024 * 8)) - vchunk = db.execute("select * from vec_xyz_vector_chunks00").fetchone() - assert vchunk["rowid"] == 1 - assert vchunk["vectors"] == bytearray(int(1024 * 4 * 2)) + # as of TODO, no initial row is inside the chunks table + assert chunk is None + # assert chunk["chunk_id"] == 1 + # assert chunk["validity"] == bytearray(int(1024 / 8)) + # assert chunk["rowids"] == bytearray(int(1024 * 8)) + # vchunk = db.execute("select * from vec_xyz_vector_chunks00").fetchone() + # assert vchunk["rowid"] == 1 + # assert vchunk["vectors"] == bytearray(int(1024 * 4 * 2)) assert re.match( - "SCAN (TABLE )?vec_xyz VIRTUAL TABLE INDEX 0:knn:", + "SCAN (TABLE )?vec_xyz VIRTUAL TABLE INDEX 0:3{___}___", explain_query_plan( "select * from vec_xyz where a match X'' and k = 10 order by distance" ), ) if SUPPORTS_VTAB_LIMIT: 
assert re.match(
-        "SCAN (TABLE )?vec_xyz VIRTUAL TABLE INDEX 0:knn:",
+        "SCAN (TABLE )?vec_xyz VIRTUAL TABLE INDEX 0:3{___}___",
         explain_query_plan(
             "select * from vec_xyz where a match X'' order by distance limit 10"
         ),
     )
 
     assert re.match(
-        "SCAN (TABLE )?vec_xyz VIRTUAL TABLE INDEX 0:fullscan",
+        "SCAN (TABLE )?vec_xyz VIRTUAL TABLE INDEX 0:1",
         explain_query_plan("select * from vec_xyz"),
     )
     assert re.match(
-        "SCAN (TABLE )?vec_xyz VIRTUAL TABLE INDEX 3:point",
+        "SCAN (TABLE )?vec_xyz VIRTUAL TABLE INDEX 3:2",
         explain_query_plan("select * from vec_xyz where rowid = 4"),
     )
diff --git a/tests/test-partition-keys.py b/tests/test-partition-keys.py
new file mode 100644
index 0000000..41c7671
--- /dev/null
+++ b/tests/test-partition-keys.py
@@ -0,0 +1,115 @@
+import sqlite3
+from collections import OrderedDict
+
+
+def test_constructor_limit(db, snapshot):
+    assert exec(
+        db,
+        """
+    create virtual table v using vec0(
+      p1 int partition key,
+      p2 int partition key,
+      p3 int partition key,
+      p4 int partition key,
+      p5 int partition key,
+      v float[1]
+    )
+  """,
+    ) == snapshot(name="max 4 partition keys")
+
+
+def test_normal(db, snapshot):
+    db.execute(
+        "create virtual table v using vec0(p1 int partition key, a float[1], chunk_size=8)"
+    )
+
+    db.execute("insert into v(rowid, p1, a) values (1, 100, X'11223344')")
+    assert vec0_shadow_table_contents(db, "v") == snapshot(name="1 row")
+    db.execute("insert into v(rowid, p1, a) values (2, 100, X'44556677')")
+    assert vec0_shadow_table_contents(db, "v") == snapshot(name="2 rows, same partition")
+    db.execute("insert into v(rowid, p1, a) values (3, 200, X'8899aabb')")
+    assert vec0_shadow_table_contents(db, "v") == snapshot(name="3 rows, 2 partitions")
+
+
+def test_types(db, snapshot):
+    db.execute(
+        "create virtual table v using vec0(p1 int partition key, a float[1], chunk_size=8)"
+    )
+
+    # EVIDENCE-OF: V11454_28292
+    assert exec(
+        db, "insert into v(p1, a) values(?, ?)", ["not int", b"\x11\x22\x33\x44"]
+    ) == snapshot(name="1. raises type error")
+
+    assert vec0_shadow_table_contents(db, "v") == snapshot(name="2. empty DB")
+
+    # but allow NULLs
+    assert exec(
+        db, "insert into v(p1, a) values(?, ?)", [None, b"\x11\x22\x33\x44"]
+    ) == snapshot(name="3. allow nulls")
+
+    assert vec0_shadow_table_contents(db, "v") == snapshot(
+        name="4. show NULL partition key"
+    )
+
+
+def test_updates(db, snapshot):
+    db.execute(
+        "create virtual table v using vec0(p text partition key, a float[1], chunk_size=8)"
+    )
+
+    db.execute(
+        "insert into v(rowid, p, a) values (?, ?, ?)", [1, "a", b"\x11\x11\x11\x11"]
+    )
+    db.execute(
+        "insert into v(rowid, p, a) values (?, ?, ?)", [2, "a", b"\x22\x22\x22\x22"]
+    )
+    db.execute(
+        "insert into v(rowid, p, a) values (?, ?, ?)", [3, "a", b"\x33\x33\x33\x33"]
+    )
+
+    assert exec(db, "select * from v") == snapshot(name="1. Initial dataset")
+    assert exec(db, "update v set p = ? where rowid = ?", ["new", 1]) == snapshot(
+        name="2. 
update #1" + ) + + +class Row: + def __init__(self): + pass + + def __repr__(self) -> str: + return repr() + + +def exec(db, sql, parameters=[]): + try: + rows = db.execute(sql, parameters).fetchall() + except (sqlite3.OperationalError, sqlite3.DatabaseError) as e: + return { + "error": e.__class__.__name__, + "message": str(e), + } + a = [] + for row in rows: + o = OrderedDict() + for k in row.keys(): + o[k] = row[k] + a.append(o) + result = OrderedDict() + result["sql"] = sql + result["rows"] = a + return result + + +def vec0_shadow_table_contents(db, v): + shadow_tables = [ + row[0] + for row in db.execute( + "select name from sqlite_master where name like ? order by 1", [f"{v}_%"] + ).fetchall() + ] + o = {} + for shadow_table in shadow_tables: + o[shadow_table] = exec(db, f"select * from {shadow_table}") + return o diff --git a/tests/test-unit.c b/tests/test-unit.c new file mode 100644 index 0000000..d9a1211 --- /dev/null +++ b/tests/test-unit.c @@ -0,0 +1,54 @@ +#include "../sqlite-vec.h" +#include +#include +#include + +#define countof(x) (sizeof(x) / sizeof((x)[0])) + +void test_vec0_parse_partition_key_definition() { + printf("Starting %s...\n", __func__); + typedef struct { + char * test; + int expected_rc; + const char *expected_column_name; + int expected_column_type; + } TestCase; + + TestCase suite[] = { + {"user_id integer partition key", SQLITE_OK, "user_id", SQLITE_INTEGER}, + {"USER_id int partition key", SQLITE_OK, "USER_id", SQLITE_INTEGER}, + {"category text partition key", SQLITE_OK, "category", SQLITE_TEXT}, + + {"", SQLITE_EMPTY, "", 0}, + {"document_id text primary key", SQLITE_EMPTY, "", 0}, + {"document_id text partition keyy", SQLITE_EMPTY, "", 0}, + }; + for(int i = 0; i < countof(suite); i++) { + char * out_column_name; + int out_column_name_length; + int out_column_type; + int rc; + rc = vec0_parse_partition_key_definition( + suite[i].test, + strlen(suite[i].test), + &out_column_name, + &out_column_name_length, + &out_column_type + ); + printf("2\n"); + assert(rc == suite[i].expected_rc); + + if(rc == SQLITE_OK) { + assert(out_column_name_length == strlen(suite[i].expected_column_name)); + assert(strncmp(out_column_name, suite[i].expected_column_name, out_column_name_length) == 0); + assert(out_column_type == suite[i].expected_column_type); + } + + printf("✅ %s\n", suite[i].test); + } +} + +int main() { + printf("Starting unit tests...\n"); + test_vec0_parse_partition_key_definition(); +} diff --git a/tests/uv.lock b/tests/uv.lock new file mode 100644 index 0000000..7e3ee4b --- /dev/null +++ b/tests/uv.lock @@ -0,0 +1,120 @@ +version = 1 +requires-python = ">=3.12" + +[[package]] +name = "colorama" +version = "0.4.6" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, +] + +[[package]] +name = "iniconfig" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d7/4b/cbd8e699e64a6f16ca3a8220661b5f83792b3017d0f79807cb8708d33913/iniconfig-2.0.0.tar.gz", hash = 
"sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3", size = 4646 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ef/a6/62565a6e1cf69e10f5727360368e451d4b7f58beeac6173dc9db836a5b46/iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374", size = 5892 }, +] + +[[package]] +name = "numpy" +version = "2.1.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/25/ca/1166b75c21abd1da445b97bf1fa2f14f423c6cfb4fc7c4ef31dccf9f6a94/numpy-2.1.3.tar.gz", hash = "sha256:aa08e04e08aaf974d4458def539dece0d28146d866a39da5639596f4921fd761", size = 20166090 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8a/f0/385eb9970309643cbca4fc6eebc8bb16e560de129c91258dfaa18498da8b/numpy-2.1.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f55ba01150f52b1027829b50d70ef1dafd9821ea82905b63936668403c3b471e", size = 20849658 }, + { url = "https://files.pythonhosted.org/packages/54/4a/765b4607f0fecbb239638d610d04ec0a0ded9b4951c56dc68cef79026abf/numpy-2.1.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:13138eadd4f4da03074851a698ffa7e405f41a0845a6b1ad135b81596e4e9958", size = 13492258 }, + { url = "https://files.pythonhosted.org/packages/bd/a7/2332679479c70b68dccbf4a8eb9c9b5ee383164b161bee9284ac141fbd33/numpy-2.1.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:a6b46587b14b888e95e4a24d7b13ae91fa22386c199ee7b418f449032b2fa3b8", size = 5090249 }, + { url = "https://files.pythonhosted.org/packages/c1/67/4aa00316b3b981a822c7a239d3a8135be2a6945d1fd11d0efb25d361711a/numpy-2.1.3-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:0fa14563cc46422e99daef53d725d0c326e99e468a9320a240affffe87852564", size = 6621704 }, + { url = "https://files.pythonhosted.org/packages/5e/da/1a429ae58b3b6c364eeec93bf044c532f2ff7b48a52e41050896cf15d5b1/numpy-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8637dcd2caa676e475503d1f8fdb327bc495554e10838019651b76d17b98e512", size = 13606089 }, + { url = "https://files.pythonhosted.org/packages/9e/3e/3757f304c704f2f0294a6b8340fcf2be244038be07da4cccf390fa678a9f/numpy-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2312b2aa89e1f43ecea6da6ea9a810d06aae08321609d8dc0d0eda6d946a541b", size = 16043185 }, + { url = "https://files.pythonhosted.org/packages/43/97/75329c28fea3113d00c8d2daf9bc5828d58d78ed661d8e05e234f86f0f6d/numpy-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:a38c19106902bb19351b83802531fea19dee18e5b37b36454f27f11ff956f7fc", size = 16410751 }, + { url = "https://files.pythonhosted.org/packages/ad/7a/442965e98b34e0ae9da319f075b387bcb9a1e0658276cc63adb8c9686f7b/numpy-2.1.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:02135ade8b8a84011cbb67dc44e07c58f28575cf9ecf8ab304e51c05528c19f0", size = 14082705 }, + { url = "https://files.pythonhosted.org/packages/ac/b6/26108cf2cfa5c7e03fb969b595c93131eab4a399762b51ce9ebec2332e80/numpy-2.1.3-cp312-cp312-win32.whl", hash = "sha256:e6988e90fcf617da2b5c78902fe8e668361b43b4fe26dbf2d7b0f8034d4cafb9", size = 6239077 }, + { url = "https://files.pythonhosted.org/packages/a6/84/fa11dad3404b7634aaab50733581ce11e5350383311ea7a7010f464c0170/numpy-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:0d30c543f02e84e92c4b1f415b7c6b5326cbe45ee7882b6b77db7195fb971e3a", size = 12566858 }, + { url = 
"https://files.pythonhosted.org/packages/4d/0b/620591441457e25f3404c8057eb924d04f161244cb8a3680d529419aa86e/numpy-2.1.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:96fe52fcdb9345b7cd82ecd34547fca4321f7656d500eca497eb7ea5a926692f", size = 20836263 }, + { url = "https://files.pythonhosted.org/packages/45/e1/210b2d8b31ce9119145433e6ea78046e30771de3fe353f313b2778142f34/numpy-2.1.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:f653490b33e9c3a4c1c01d41bc2aef08f9475af51146e4a7710c450cf9761598", size = 13507771 }, + { url = "https://files.pythonhosted.org/packages/55/44/aa9ee3caee02fa5a45f2c3b95cafe59c44e4b278fbbf895a93e88b308555/numpy-2.1.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:dc258a761a16daa791081d026f0ed4399b582712e6fc887a95af09df10c5ca57", size = 5075805 }, + { url = "https://files.pythonhosted.org/packages/78/d6/61de6e7e31915ba4d87bbe1ae859e83e6582ea14c6add07c8f7eefd8488f/numpy-2.1.3-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:016d0f6f5e77b0f0d45d77387ffa4bb89816b57c835580c3ce8e099ef830befe", size = 6608380 }, + { url = "https://files.pythonhosted.org/packages/3e/46/48bdf9b7241e317e6cf94276fe11ba673c06d1fdf115d8b4ebf616affd1a/numpy-2.1.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c181ba05ce8299c7aa3125c27b9c2167bca4a4445b7ce73d5febc411ca692e43", size = 13602451 }, + { url = "https://files.pythonhosted.org/packages/70/50/73f9a5aa0810cdccda9c1d20be3cbe4a4d6ea6bfd6931464a44c95eef731/numpy-2.1.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5641516794ca9e5f8a4d17bb45446998c6554704d888f86df9b200e66bdcce56", size = 16039822 }, + { url = "https://files.pythonhosted.org/packages/ad/cd/098bc1d5a5bc5307cfc65ee9369d0ca658ed88fbd7307b0d49fab6ca5fa5/numpy-2.1.3-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:ea4dedd6e394a9c180b33c2c872b92f7ce0f8e7ad93e9585312b0c5a04777a4a", size = 16411822 }, + { url = "https://files.pythonhosted.org/packages/83/a2/7d4467a2a6d984549053b37945620209e702cf96a8bc658bc04bba13c9e2/numpy-2.1.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:b0df3635b9c8ef48bd3be5f862cf71b0a4716fa0e702155c45067c6b711ddcef", size = 14079598 }, + { url = "https://files.pythonhosted.org/packages/e9/6a/d64514dcecb2ee70bfdfad10c42b76cab657e7ee31944ff7a600f141d9e9/numpy-2.1.3-cp313-cp313-win32.whl", hash = "sha256:50ca6aba6e163363f132b5c101ba078b8cbd3fa92c7865fd7d4d62d9779ac29f", size = 6236021 }, + { url = "https://files.pythonhosted.org/packages/bb/f9/12297ed8d8301a401e7d8eb6b418d32547f1d700ed3c038d325a605421a4/numpy-2.1.3-cp313-cp313-win_amd64.whl", hash = "sha256:747641635d3d44bcb380d950679462fae44f54b131be347d5ec2bce47d3df9ed", size = 12560405 }, + { url = "https://files.pythonhosted.org/packages/a7/45/7f9244cd792e163b334e3a7f02dff1239d2890b6f37ebf9e82cbe17debc0/numpy-2.1.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:996bb9399059c5b82f76b53ff8bb686069c05acc94656bb259b1d63d04a9506f", size = 20859062 }, + { url = "https://files.pythonhosted.org/packages/b1/b4/a084218e7e92b506d634105b13e27a3a6645312b93e1c699cc9025adb0e1/numpy-2.1.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:45966d859916ad02b779706bb43b954281db43e185015df6eb3323120188f9e4", size = 13515839 }, + { url = "https://files.pythonhosted.org/packages/27/45/58ed3f88028dcf80e6ea580311dc3edefdd94248f5770deb980500ef85dd/numpy-2.1.3-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:baed7e8d7481bfe0874b566850cb0b85243e982388b7b23348c6db2ee2b2ae8e", size = 5116031 }, + { url = 
"https://files.pythonhosted.org/packages/37/a8/eb689432eb977d83229094b58b0f53249d2209742f7de529c49d61a124a0/numpy-2.1.3-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:a9f7f672a3388133335589cfca93ed468509cb7b93ba3105fce780d04a6576a0", size = 6629977 }, + { url = "https://files.pythonhosted.org/packages/42/a3/5355ad51ac73c23334c7caaed01adadfda49544f646fcbfbb4331deb267b/numpy-2.1.3-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7aac50327da5d208db2eec22eb11e491e3fe13d22653dce51b0f4109101b408", size = 13575951 }, + { url = "https://files.pythonhosted.org/packages/c4/70/ea9646d203104e647988cb7d7279f135257a6b7e3354ea6c56f8bafdb095/numpy-2.1.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4394bc0dbd074b7f9b52024832d16e019decebf86caf909d94f6b3f77a8ee3b6", size = 16022655 }, + { url = "https://files.pythonhosted.org/packages/14/ce/7fc0612903e91ff9d0b3f2eda4e18ef9904814afcae5b0f08edb7f637883/numpy-2.1.3-cp313-cp313t-musllinux_1_1_x86_64.whl", hash = "sha256:50d18c4358a0a8a53f12a8ba9d772ab2d460321e6a93d6064fc22443d189853f", size = 16399902 }, + { url = "https://files.pythonhosted.org/packages/ef/62/1d3204313357591c913c32132a28f09a26357e33ea3c4e2fe81269e0dca1/numpy-2.1.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:14e253bd43fc6b37af4921b10f6add6925878a42a0c5fe83daee390bca80bc17", size = 14067180 }, + { url = "https://files.pythonhosted.org/packages/24/d7/78a40ed1d80e23a774cb8a34ae8a9493ba1b4271dde96e56ccdbab1620ef/numpy-2.1.3-cp313-cp313t-win32.whl", hash = "sha256:08788d27a5fd867a663f6fc753fd7c3ad7e92747efc73c53bca2f19f8bc06f48", size = 6291907 }, + { url = "https://files.pythonhosted.org/packages/86/09/a5ab407bd7f5f5599e6a9261f964ace03a73e7c6928de906981c31c38082/numpy-2.1.3-cp313-cp313t-win_amd64.whl", hash = "sha256:2564fbdf2b99b3f815f2107c1bbc93e2de8ee655a69c261363a1172a79a257d4", size = 12644098 }, +] + +[[package]] +name = "packaging" +version = "24.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d0/63/68dbb6eb2de9cb10ee4c9c14a0148804425e13c4fb20d61cce69f53106da/packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f", size = 163950 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/ef/eb23f262cca3c0c4eb7ab1933c3b1f03d021f2c48f54763065b6f0e321be/packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759", size = 65451 }, +] + +[[package]] +name = "pluggy" +version = "1.5.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/96/2d/02d4312c973c6050a18b314a5ad0b3210edb65a906f868e31c111dede4a6/pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1", size = 67955 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/88/5f/e351af9a41f866ac3f1fac4ca0613908d9a41741cfcf2228f4ad853b697d/pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669", size = 20556 }, +] + +[[package]] +name = "pytest" +version = "8.3.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, + { name = "iniconfig" }, + { name = "packaging" }, + { name = "pluggy" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8b/6c/62bbd536103af674e227c41a8f3dcd022d591f6eed5facb5a0f31ee33bbc/pytest-8.3.3.tar.gz", hash = 
"sha256:70b98107bd648308a7952b06e6ca9a50bc660be218d53c257cc1fc94fda10181", size = 1442487 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/6b/77/7440a06a8ead44c7757a64362dd22df5760f9b12dc5f11b6188cd2fc27a0/pytest-8.3.3-py3-none-any.whl", hash = "sha256:a6853c7375b2663155079443d2e45de913a911a11d669df02a50814944db57b2", size = 342341 }, +] + +[[package]] +name = "syrupy" +version = "4.7.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pytest" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/67/81/f46d234fa4ca0edcdeed973bab9acd8f8ac186537cdc850e9e84a00f61a0/syrupy-4.7.2.tar.gz", hash = "sha256:ea45e099f242de1bb53018c238f408a5bb6c82007bc687aefcbeaa0e1c2e935a", size = 49320 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/b9/75/57b629fdd256efc58fb045618d603ce0b0f5fcc477f34b758e34423efb99/syrupy-4.7.2-py3-none-any.whl", hash = "sha256:eae7ba6be5aed190237caa93be288e97ca1eec5ca58760e4818972a10c4acc64", size = 49234 }, +] + +[[package]] +name = "tests" +version = "0.1.0" +source = { virtual = "." } +dependencies = [ + { name = "numpy" }, + { name = "pytest" }, + { name = "syrupy" }, +] + +[package.metadata] +requires-dist = [ + { name = "numpy" }, + { name = "pytest" }, + { name = "syrupy" }, +]