Skip to content

Commit

Permalink
Merge pull request #1 from the-alex-b/0.3.0
Browse files: browse the repository at this point in the history
0.3.0
  • Loading branch information
the-alex-b authored Apr 29, 2023
2 parents 5e9246d + 4e3f893 commit 677063a
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 52 deletions.
6 changes: 4 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Locus
Locus is a local, simple, in-memory vector database based on hnswlib.
Locus is a local, simple, append-only, in-memory vector database based on hnswlib.

## Installation
``` bash
Expand Down Expand Up @@ -33,7 +33,9 @@ for vector in vectors:
# retrieve the closest vectors to a query embedding
query_embedding = np.random.randn(config.dim)
results = index.retrieve(query_embedding, number_of_results=3)
print(results)

print(f"Matches: {results}")
print(f"Items in index: {index.count}")

# store the index on disk
index._store_on_disk()
Expand Down
16 changes: 11 additions & 5 deletions src/locusdb/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,15 +66,17 @@ def from_file(cls, file="index.db") -> Index:
with open(file, "rb") as handle:
return pickle.load(handle)

def add_vector(self, vector: Vector, persist_on_disk=True) -> None:
def add_vector(self, vector: Vector, persist_on_disk=False) -> None:
storage_id = len(self.structured_memory)

# add to hnsw index
self.hnsw_index.add_items(vector.embedding, len(self.structured_memory))
self.hnsw_index.add_items(vector.embedding, storage_id)

# add to structured data
self.structured_memory[len(self.structured_memory)] = vector.data
self.structured_memory[storage_id] = vector.data

if persist_on_disk:
self._store_on_disk()
self.persist_on_disk()

def retrieve(self, embedding: np.array, number_of_results: int = 3) -> list[dict]:
labels, distances = self.hnsw_index.knn_query(embedding, k=number_of_results)
Expand All @@ -84,9 +86,13 @@ def retrieve(self, embedding: np.array, number_of_results: int = 3) -> list[dict
for i, id in enumerate(labels[0])
]

def _store_on_disk(self) -> None:
def persist_on_disk(self) -> None:
with open(
self.config.storage_location,
"wb",
) as handle:
pickle.dump(self, handle, protocol=pickle.HIGHEST_PROTOCOL)

@property
def count(self) -> int:
return len(self.structured_memory)
91 changes: 46 additions & 45 deletions src/run.py
Original file line number Diff line number Diff line change
@@ -1,52 +1,53 @@
# from locus import Index, Vector, Config
# import numpy as np
# from pprint import pprint
import numpy as np
from locusdb import Config, Index, Vector
import cProfile

# index = Index(dimensions=10)

# embedding = np.float32(np.random.random((1, 10)))
# structured_data = {"a": 1}
# vector = Vector(embedding=embedding, data=structured_data)
def profile_performance():
num_of_elements = 1000

# create a new configuration
config = Config(
max_elements=num_of_elements,
ef_construction=200,
M=16,
dim=128,
space="cosine",
storage_location="index.db",
)

# index.add_vector(vector)
# create a new index instance
index = Index(dimensions=config.dim, config=config)

import numpy as np
# create some random vectors
vectors = []
for i in range(num_of_elements):
embedding = np.random.randn(config.dim)
data = {"id": i, "message": f"test message {i}"}
vector = Vector(embedding=embedding, data=data)
vectors.append(vector)

# add the vectors to the index
for vector in vectors:
index.add_vector(vector, persist_on_disk=False) # persisting is very expensive

# retrieve the closest vectors to a query embedding
query_embedding = np.random.randn(config.dim)
results = index.retrieve(query_embedding, number_of_results=3)

print(f"Matches: {results}")
print(f"Items in index: {index.count}")

# store the index on disk
index.persist_on_disk()

# load the index from disk
new_index = Index.from_file(config.storage_location)

# retrieve the closest vectors to a query embedding
# query_embedding = np.random.randn(config.dim)
# results = new_index.retrieve(query_embedding, number_of_results=3)
# print(results)

from locusdb import Config, Index, Vector

# create a new configuration
config = Config(
max_elements=1000,
ef_construction=200,
M=16,
dim=128,
space="cosine",
storage_location="index.db",
)

# create a new index instance
index = Index(dimensions=config.dim, config=config)

# create some random vectors
vectors = []
for i in range(10):
embedding = np.random.randn(config.dim)
data = {"id": i, "message": f"test message {i}"}
vector = Vector(embedding=embedding, data=data)
vectors.append(vector)

# add the vectors to the index
for vector in vectors:
index.add_vector(vector)

# retrieve the closest vectors to a query embedding
query_embedding = np.random.randn(config.dim)
results = index.retrieve(query_embedding, number_of_results=3)
print(results)

# store the index on disk
index._store_on_disk()

# load the index from disk
new_index = Index.from_file(config.storage_location)
cProfile.run("profile_performance()")

0 comments on commit 677063a

Please sign in to comment.