-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Closed
Description
# %%
import chromadb
# Default in-process (ephemeral) Chroma client — nothing is persisted to disk,
# so every run re-indexes from scratch.
client = chromadb.Client()
# %%
import numpy as np
from tqdm import tqdm
# %%
# Types needed to declare a custom embedding function below.
from chromadb.api.types import Documents, EmbeddingFunction, Embeddings
class MyEmbeddingFunction(EmbeddingFunction):
    """Stub embedding function: maps each document to a random 512-d vector.

    Stands in for a real model so the benchmark measures Chroma's
    indexing/query cost rather than embedding cost.
    """

    def __call__(self, texts: Documents) -> Embeddings:
        # One random 512-dimensional vector per input document.
        count = len(texts)
        return np.random.random((count, 512)).tolist()
# %%
# Create the collection, wiring in the stub embedding function above.
embedding_fn = MyEmbeddingFunction()
collection = client.create_collection(
    name="clip_image_product",
    embedding_function=embedding_fn,
)
# %%
# Sanity check: collection starts empty.
collection.count()
# %%
# Ingest 10,000 batches of 100 vectors = 1M total.
# NOTE(reporter): this loop takes > 5 hours end to end.
for chunk in tqdm(range(10000)):
    batch = np.random.rand(100, 512)
    # Global ids for this batch: chunk*100 .. chunk*100+99.
    offsets = [chunk * 100 + i for i in range(100)]
    collection.add(
        documents=[f"This is a document id{n}" for n in offsets],
        metadatas=[{"color": ["red", "yellow", "blue"][i % 3]} for i in range(100)],
        ids=[f"id{n}" for n in offsets],
        embeddings=batch.tolist(),
    )
# %%
# Confirm all 1M items were indexed.
collection.count()
# %%
# Single nearest-neighbour query for the top 1000 results.
# NOTE(reporter): this takes ~178 ms.
probe = np.random.random((1, 512))
hits = collection.query(
    query_embeddings=probe.tolist(),
    n_results=1000,
)
So it takes >5 hours to index 1M random vectors, and >100 ms to query the top 1000 vectors from the 1M. Is this the expected performance for Chroma?
My use case has 5–20M vectors and requires QPS ≈ 1000 with P95 latency < 100 ms — would Chroma be the right tool at this scale?
Metadata
Metadata
Assignees
Labels
No labels