
Commit 48cb8a0

refactor: simplify DuckDB store to native JSON storage only
- Remove native_storage parameter from DuckDBSerializationAdapter and DuckDBStore
- Remove value_json TEXT column from schema, use only value_dict JSON column
- Update all SQL statements to work with single value_dict column
- Remove TestDuckDBStoreTextMode test class
- Remove test_text_mode_storage() and test_native_vs_stringified_storage() tests
- Update test_native_sql_queryability() to use correct JSON path for nested value
- Regenerate sync library with simplified implementation

This simplification focuses on native JSON storage for better queryability and removes the complexity of supporting dual storage modes.

Co-authored-by: William Easton <strawgate@users.noreply.github.com>
1 parent 2a2094f commit 48cb8a0
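The queryability this commit targets is easiest to see as a direct SQL query against the simplified schema. The following is a minimal sketch, not part of the commit: the kv_entries table and the value_dict/expires_at columns come from the diff below, while the "users" collection, the email field, and the database path are hypothetical. Because the stored document wraps the user value under a "value" key, application fields are addressed via the $.value JSON path (the same nested path referenced for test_native_sql_queryability()).

import duckdb

# Hypothetical database file produced by a DuckDBStore; table and column names
# match the schema introduced in this commit.
conn = duckdb.connect("kv.duckdb")

# Each value_dict holds a document like {"value": {...}, "version": ..., "key": ..., "collection": ...},
# so application fields live under the $.value path.
rows = conn.execute(
    """
    SELECT key, json_extract_string(value_dict, '$.value.email') AS email
    FROM kv_entries
    WHERE collection = ?
      AND (expires_at IS NULL OR expires_at > now())
    """,
    ["users"],
).fetchall()
print(rows)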

File tree

4 files changed: +83, -337 lines changed


key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py

Lines changed: 39 additions & 79 deletions
@@ -18,31 +18,22 @@
 
 
 class DuckDBSerializationAdapter(SerializationAdapter):
-    """Adapter for DuckDB with support for native JSON and TEXT storage modes."""
+    """Adapter for DuckDB with native JSON storage."""
 
-    _native_storage: bool
-
-    def __init__(self, *, native_storage: bool = True) -> None:
-        """Initialize the DuckDB adapter.
-
-        Args:
-            native_storage: If True, use JSON column for native dict storage.
-                If False, use TEXT column for stringified JSON.
-        """
+    def __init__(self) -> None:
+        """Initialize the DuckDB adapter."""
         super().__init__()
 
-        self._native_storage = native_storage
         self._date_format = "datetime"
-        # Always use string format - DuckDB needs JSON strings for both TEXT and JSON columns
+        # Use string format - DuckDB needs JSON strings for JSON columns
         self._value_format = "string"
 
     @override
     def prepare_dump(self, data: dict[str, Any]) -> dict[str, Any]:
         """Prepare data for dumping to DuckDB.
 
-        Moves the value to the appropriate column (value_dict or value_json)
-        and sets the other column to None. Also includes version, key, and collection
-        fields in the JSON for compatibility with deserialization.
+        Stores the value in the value_dict JSON column and includes version, key,
+        and collection fields in the JSON for compatibility with deserialization.
         """
         value = data.pop("value")
 
@@ -51,7 +42,7 @@ def prepare_dump(self, data: dict[str, Any]) -> dict[str, Any]:
         key = data.pop("key", None)
         collection_name = data.pop("collection", None)
 
-        # Build the document to store in JSON columns
+        # Build the document to store in JSON column
         json_document: dict[str, Any] = {"value": value}
 
         if version is not None:
@@ -61,32 +52,23 @@ def prepare_dump(self, data: dict[str, Any]) -> dict[str, Any]:
         if collection_name is not None:
             json_document["collection"] = collection_name
 
-        # Set both columns to None, then populate the appropriate one
-        data["value_json"] = None
-        data["value_dict"] = None
-
-        if self._native_storage:
-            # For native storage, convert the document to JSON string for DuckDB's JSON column
-            # DuckDB will parse it and store it as native JSON
-            data["value_dict"] = json.dumps(json_document)
-        else:
-            # For TEXT storage, store as JSON string
-            data["value_json"] = json.dumps(json_document)
+        # Store as JSON string for DuckDB's JSON column
+        # DuckDB will parse it and store it as native JSON
+        data["value_dict"] = json.dumps(json_document)
 
         return data
 
     @override
     def prepare_load(self, data: dict[str, Any]) -> dict[str, Any]:
         """Prepare data loaded from DuckDB for conversion to ManagedEntry.
 
-        Extracts value, version, key, and collection from the JSON columns
+        Extracts value, version, key, and collection from the JSON column
         and handles timezone conversion for DuckDB's naive timestamps.
         """
-        value_json = data.pop("value_json", None)
         value_dict = data.pop("value_dict", None)
 
-        # Parse the JSON document from the appropriate column
-        json_document = self._parse_json_column(value_dict, value_json)
+        # Parse the JSON document from the value_dict column
+        json_document = self._parse_json_column(value_dict)
 
         # Extract fields from the JSON document
         data["value"] = json_document.get("value")
@@ -102,27 +84,20 @@ def prepare_load(self, data: dict[str, Any]) -> dict[str, Any]:
 
         return data
 
-    def _parse_json_column(self, value_dict: Any, value_json: Any) -> dict[str, Any]:  # noqa: ANN401
-        """Parse JSON from value_dict or value_json column."""
-        if value_dict is not None:
-            # Native storage mode - value_dict can be dict or string (DuckDB JSON returns as string)
-            if isinstance(value_dict, dict):
-                return cast(dict[str, Any], value_dict)
-            if isinstance(value_dict, str):
-                parsed: dict[str, Any] = json.loads(value_dict)
-                return parsed
-            msg = f"value_dict has unexpected type: {type(value_dict)}"
+    def _parse_json_column(self, value_dict: Any) -> dict[str, Any]:
+        """Parse JSON from value_dict column."""
+        if value_dict is None:
+            msg = "value_dict column contains no data"
             raise DeserializationError(message=msg)
 
-        if value_json is not None:
-            # Stringified JSON mode - parse from string
-            if isinstance(value_json, str):
-                parsed_json: dict[str, Any] = json.loads(value_json)
-                return parsed_json
-            msg = f"value_json has unexpected type: {type(value_json)}"
-            raise DeserializationError(message=msg)
+        # value_dict can be dict or string (DuckDB JSON returns as string)
+        if isinstance(value_dict, dict):
+            return cast("dict[str, Any]", value_dict)
+        if isinstance(value_dict, str):
+            parsed: dict[str, Any] = json.loads(value_dict)
+            return parsed
 
-        msg = "Neither value_dict nor value_json column contains data"
+        msg = f"value_dict has unexpected type: {type(value_dict)}"
         raise DeserializationError(message=msg)
 
     def _convert_timestamps_to_utc(self, data: dict[str, Any]) -> None:
@@ -146,9 +121,8 @@ class DuckDBStore(BaseContextManagerStore, BaseStore):
     The store uses native DuckDB types (JSON, TIMESTAMP) to enable efficient SQL queries
     on stored data. Users can query the database directly for analytics or data exploration.
 
-    Storage modes:
-    - native_storage=True: Stores values in a JSON column as native dicts for queryability
-    - native_storage=False: Stores values as stringified JSON in a TEXT column
+    Values are stored in a JSON column as native dicts, allowing direct SQL queries
+    on the stored data for analytics and reporting.
 
     Note on connection ownership: When you provide an existing connection, the store
    will take ownership and close it when the store is closed or garbage collected.
@@ -167,7 +141,6 @@ def __init__(
         *,
         connection: duckdb.DuckDBPyConnection,
         table_name: str = "kv_entries",
-        native_storage: bool = True,
         default_collection: str | None = None,
         seed: SEED_DATA_TYPE | None = None,
     ) -> None:
@@ -180,8 +153,6 @@ def __init__(
         Args:
             connection: An existing DuckDB connection to use.
             table_name: Name of the table to store key-value entries. Defaults to "kv_entries".
-            native_storage: If True, use native JSON column for dict storage; if False, use TEXT for stringified JSON.
-                Default is True for better queryability and native type support.
             default_collection: The default collection to use if no collection is provided.
             seed: Optional seed data to pre-populate the store.
         """
@@ -192,7 +163,6 @@ def __init__(
         *,
         database_path: Path | str | None = None,
         table_name: str = "kv_entries",
-        native_storage: bool = True,
         default_collection: str | None = None,
         seed: SEED_DATA_TYPE | None = None,
     ) -> None:
@@ -201,8 +171,6 @@ def __init__(
         Args:
            database_path: Path to the database file. If None or ':memory:', uses in-memory database.
            table_name: Name of the table to store key-value entries. Defaults to "kv_entries".
-            native_storage: If True, use native JSON column for dict storage; if False, use TEXT for stringified JSON.
-                Default is True for better queryability and native type support.
            default_collection: The default collection to use if no collection is provided.
            seed: Optional seed data to pre-populate the store.
        """
@@ -213,7 +181,6 @@ def __init__(
         connection: duckdb.DuckDBPyConnection | None = None,
         database_path: Path | str | None = None,
         table_name: str = "kv_entries",
-        native_storage: bool = True,
         default_collection: str | None = None,
         seed: SEED_DATA_TYPE | None = None,
     ) -> None:
@@ -223,8 +190,6 @@ def __init__(
             connection: An existing DuckDB connection to use.
             database_path: Path to the database file. If None or ':memory:', uses in-memory database.
             table_name: Name of the table to store key-value entries. Defaults to "kv_entries".
-            native_storage: If True, use native JSON column for dict storage; if False, use TEXT for stringified JSON.
-                Default is True for better queryability and native type support.
             default_collection: The default collection to use if no collection is provided.
             seed: Optional seed data to pre-populate the store.
         """
@@ -248,7 +213,7 @@ def __init__(
             self._owns_connection = True
 
         self._is_closed = False
-        self._adapter = DuckDBSerializationAdapter(native_storage=native_storage)
+        self._adapter = DuckDBSerializationAdapter()
         self._table_name = table_name
         self._stable_api = False
 
@@ -264,8 +229,7 @@ def _get_create_table_sql(self) -> str:
            CREATE TABLE IF NOT EXISTS {self._table_name} (
                collection VARCHAR NOT NULL,
                key VARCHAR NOT NULL,
-                value_json TEXT,
-                value_dict JSON,
+                value_dict JSON NOT NULL,
                created_at TIMESTAMP,
                expires_at TIMESTAMP,
                PRIMARY KEY (collection, key)
@@ -301,7 +265,7 @@ def _get_select_sql(self) -> str:
            SQL SELECT statement with placeholders.
        """
        return f"""
-            SELECT value_json, value_dict, created_at, expires_at
+            SELECT value_dict, created_at, expires_at
            FROM {self._table_name}
            WHERE collection = ? AND key = ?
        """  # noqa: S608
@@ -314,8 +278,8 @@ def _get_insert_sql(self) -> str:
        """
        return f"""
            INSERT OR REPLACE INTO {self._table_name}
-            (collection, key, value_json, value_dict, created_at, expires_at)
-            VALUES (?, ?, ?, ?, ?, ?)
+            (collection, key, value_dict, created_at, expires_at)
+            VALUES (?, ?, ?, ?, ?)
        """  # noqa: S608
 
    def _get_delete_sql(self) -> str:
@@ -335,17 +299,15 @@ async def _setup(self) -> None:
        """Initialize the database schema for key-value storage.
 
        The schema uses native DuckDB types for efficient querying:
-        - value_json: TEXT column storing stringified JSON (used when native_storage=False)
-        - value_dict: JSON column storing native dicts (used when native_storage=True)
+        - value_dict: JSON column storing native dicts for queryability
        - created_at: TIMESTAMP for native datetime operations
        - expires_at: TIMESTAMP for native expiration queries
 
-        This design follows the Elasticsearch/MongoDB pattern of separating native and stringified
-        storage, enabling:
-        - Direct SQL queries on the database for analytics (when using native storage)
+        This design enables:
+        - Direct SQL queries on the database for analytics
        - Efficient expiration cleanup: DELETE FROM table WHERE expires_at < now()
        - Metadata queries without JSON deserialization
-        - Flexibility to choose between native dict storage and stringified JSON
+        - Native JSON column support for rich querying capabilities
        """
        # Create the main table for storing key-value entries
        self._connection.execute(self._get_create_table_sql())
@@ -360,7 +322,7 @@ async def _setup(self) -> None:
    async def _get_managed_entry(self, *, key: str, collection: str) -> ManagedEntry | None:
        """Retrieve a managed entry by key from the specified collection.
 
-        Reconstructs the ManagedEntry from value columns and metadata columns
+        Reconstructs the ManagedEntry from value column and metadata columns
        using the serialization adapter.
        """
        if self._is_closed:
@@ -375,11 +337,10 @@ async def _get_managed_entry(self, *, key: str, collection: str) -> ManagedEntry
        if result is None:
            return None
 
-        value_json, value_dict, created_at, expires_at = result
+        value_dict, created_at, expires_at = result
 
-        # Build document dict for the adapter (exclude None values)
+        # Build document dict for the adapter
        document: dict[str, Any] = {
-            "value_json": value_json,
            "value_dict": value_dict,
        }
 
@@ -411,15 +372,14 @@ async def _put_managed_entry(
            raise RuntimeError(msg)
 
        # Use adapter to dump the managed entry to a dict with key and collection
-        document = self._adapter.dump_dict(entry=managed_entry, exclude_none=False, key=key, collection=collection)
+        document = self._adapter.dump_dict(entry=managed_entry, key=key, collection=collection)
 
        # Insert or replace the entry with metadata in separate columns
        self._connection.execute(
            self._get_insert_sql(),
            [
                collection,
                key,
-                document["value_json"],
                document["value_dict"],
                document.get("created_at"),
                document.get("expires_at"),
