Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions dictdatabase/dataclasses.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,14 @@ class SearchResult:
start_byte: int
end_byte: int
found: bool


@dataclasses.dataclass(frozen=True)
class Index:
    """
    Immutable record describing where a key's value lives inside a database
    JSON file. Produced by IndexManager.create_index and consumed by
    Indexer.write.
    """

    # The JSON key this index entry describes.
    key: str
    # Byte offset where the entry starts. NOTE(review): create_index passes the
    # *value* start offset here, not the key's — confirm the field naming intent.
    key_start: int
    # Byte offset where the entry ends. NOTE(review): create_index passes the
    # *value* end offset here — confirm the field naming intent.
    key_end: int
    # Indentation depth of the key within the database file.
    indent_level: int
    # The string used for one level of indentation (e.g. a tab or spaces).
    indent_with: str
    # sha256 hex digest of the value's bytes.
    value_hash: str
    # Byte offset where the previous value ended; Indexer.write uses it to
    # shift entries that come after a value whose size changed.
    old_value_end: int
41 changes: 20 additions & 21 deletions dictdatabase/index_manager.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,27 @@
import hashlib

from dictdatabase import utils
from dictdatabase.dataclasses import Index


class IndexManager:
    @staticmethod
    def create_index(all_file_bytes: bytes, key: str, start: int, end: int) -> Index:
        """
        Build an Index entry for *key* describing where its value is located
        in a database JSON file and what that value currently contains.

        Args:
            all_file_bytes: The entire database file as a byte string.
            key: The key of the value being indexed.
            start: Byte offset where the key's value starts in the file.
            end: Byte offset where the key's value ends in the file.

        Returns:
            An Index carrying the key, the value's start/end offsets, the
            detected indentation (level and indent string), the sha256 hash
            of the value bytes, and the value's current end offset recorded
            as old_value_end so a later write can shift subsequent entries.
        """
        # Only the key's start position is needed: it anchors the
        # indentation detection. The key's end offset is unused here.
        key_start, _ = utils.find_outermost_key_in_json_bytes(all_file_bytes, key)
        indent_level, indent_with = utils.detect_indentation_in_json_bytes(
            all_file_bytes, key_start
        )
        value_hash = hashlib.sha256(all_file_bytes[start:end]).hexdigest()
        # NOTE(review): the value's start/end offsets populate the fields
        # named key_start/key_end, and `end` doubles as old_value_end —
        # consistent with Indexer.write's usage, but confirm naming intent.
        return Index(key, start, end, indent_level, indent_with, value_hash, end)
121 changes: 63 additions & 58 deletions dictdatabase/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import orjson

from . import config
from .dataclasses import Index


# Problem: Multiple read processes will concurrently read and write the same file
Expand All @@ -25,61 +26,65 @@


class Indexer:
    """
    Loads and maintains the .index file accompanying a database file.

    The index file's name is the database file's name with every "/"
    replaced by "___" and the extension .index. Its content is a JSON
    object mapping keys inside the database JSON file to lists of 5
    elements:
    - start_index: byte offset of the first byte of the key's value
    - end_index: byte offset of the last byte of the key's value
    - indent_level: indent depth of the key in the database file
    - indent_with: the string used for one indentation level
    - value_hash: sha256 hex digest of the value bytes
    """

    __slots__ = ("data", "path")

    def __init__(self, db_name: str):
        # Flatten nested database names so every index lives directly in
        # the hidden .ddb directory.
        db_name = db_name.replace("/", "___")
        self.path = os.path.join(config.storage_directory, ".ddb", f"{db_name}.index")

        os.makedirs(os.path.dirname(self.path), exist_ok=True)
        if not os.path.exists(self.path):
            self.data = {}
            return

        # A corrupt or partially written index file is not fatal — fall
        # back to an empty index rather than raising.
        try:
            with open(self.path, "rb") as f:
                self.data = orjson.loads(f.read())
        except orjson.JSONDecodeError:
            self.data = {}

    def get(self, key):
        """
        Return the stored 5-element list for *key*, or None if unknown.
        Elements: [start_index, end_index, indent_level, indent_with, value_hash]
        """
        return self.data.get(key)

    def write(self, index: Index):
        """
        Record *index* in memory and persist the whole index to disk.

        If the key already had an entry, every entry starting after the old
        end of this key's value is shifted by the size difference, so the
        byte offsets of the other keys stay valid.
        """
        if index.key in self.data:
            # How much the value grew (positive) or shrank (negative).
            delta = index.key_end - index.old_value_end
            for entry in self.data.values():
                if entry[0] > index.old_value_end:
                    entry[0] += delta
                    entry[1] += delta

        self.data[index.key] = [
            index.key_start,
            index.key_end,
            index.indent_level,
            index.indent_with,
            index.value_hash,
        ]
        with open(self.path, "wb") as f:
            f.write(orjson.dumps(self.data))
Loading