Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add namespaced vector stores to storage context #8753

Merged
merged 4 commits into from
Nov 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 69 additions & 17 deletions llama_index/storage/storage_context.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Union
from typing import Dict, Optional, Union

import fsspec

Expand All @@ -24,7 +24,11 @@
from llama_index.storage.index_store.types import BaseIndexStore
from llama_index.utils import concat_dirs
from llama_index.vector_stores.simple import DEFAULT_PERSIST_FNAME as VECTOR_STORE_FNAME
from llama_index.vector_stores.simple import SimpleVectorStore
from llama_index.vector_stores.simple import (
DEFAULT_VECTOR_STORE,
NAMESPACE_SEP,
SimpleVectorStore,
)
from llama_index.vector_stores.types import VectorStore

DEFAULT_PERSIST_DIR = "./storage"
Expand All @@ -45,7 +49,7 @@ class StorageContext:

docstore: BaseDocumentStore
index_store: BaseIndexStore
vector_store: VectorStore
vector_stores: Dict[str, VectorStore]
graph_store: GraphStore
hatianzhang marked this conversation as resolved.
Show resolved Hide resolved

@classmethod
Expand All @@ -54,6 +58,7 @@ def from_defaults(
docstore: Optional[BaseDocumentStore] = None,
index_store: Optional[BaseIndexStore] = None,
vector_store: Optional[VectorStore] = None,
vector_stores: Optional[Dict[str, VectorStore]] = None,
graph_store: Optional[GraphStore] = None,
persist_dir: Optional[str] = None,
fs: Optional[fsspec.AbstractFileSystem] = None,
Expand All @@ -70,23 +75,40 @@ def from_defaults(
if persist_dir is None:
docstore = docstore or SimpleDocumentStore()
index_store = index_store or SimpleIndexStore()
vector_store = vector_store or SimpleVectorStore()
graph_store = graph_store or SimpleGraphStore()

if vector_store:
vector_stores = {DEFAULT_VECTOR_STORE: vector_store}
else:
vector_stores = vector_stores or {
DEFAULT_VECTOR_STORE: SimpleVectorStore()
}
else:
docstore = docstore or SimpleDocumentStore.from_persist_dir(
persist_dir, fs=fs
)
index_store = index_store or SimpleIndexStore.from_persist_dir(
persist_dir, fs=fs
)
vector_store = vector_store or SimpleVectorStore.from_persist_dir(
persist_dir, fs=fs
)
graph_store = graph_store or SimpleGraphStore.from_persist_dir(
persist_dir, fs=fs
)

return cls(docstore, index_store, vector_store, graph_store)
if vector_store:
vector_stores = {DEFAULT_VECTOR_STORE: vector_store}
elif vector_stores:
vector_stores = vector_stores
else:
vector_stores = SimpleVectorStore.from_namespaced_persist_dir(
persist_dir, fs=fs
)

return cls(
docstore=docstore,
index_store=index_store,
vector_stores=vector_stores,
graph_store=graph_store,
)

def persist(
self,
Expand All @@ -106,39 +128,56 @@ def persist(
persist_dir = str(persist_dir) # NOTE: doesn't support Windows here
docstore_path = concat_dirs(persist_dir, docstore_fname)
index_store_path = concat_dirs(persist_dir, index_store_fname)
vector_store_path = concat_dirs(persist_dir, vector_store_fname)
graph_store_path = concat_dirs(persist_dir, graph_store_fname)
else:
persist_dir = Path(persist_dir)
docstore_path = str(persist_dir / docstore_fname)
index_store_path = str(persist_dir / index_store_fname)
vector_store_path = str(persist_dir / vector_store_fname)
graph_store_path = str(persist_dir / graph_store_fname)

self.docstore.persist(persist_path=docstore_path, fs=fs)
self.index_store.persist(persist_path=index_store_path, fs=fs)
self.vector_store.persist(persist_path=vector_store_path, fs=fs)
self.graph_store.persist(persist_path=graph_store_path, fs=fs)

# save each vector store under its namespace
for vector_store_name, vector_store in self.vector_stores.items():
if fs is not None:
vector_store_path = concat_dirs(
str(persist_dir),
f"{vector_store_name}{NAMESPACE_SEP}{vector_store_fname}",
)
else:
vector_store_path = str(
Path(persist_dir)
/ f"{vector_store_name}{NAMESPACE_SEP}{vector_store_fname}"
)

vector_store.persist(persist_path=vector_store_path, fs=fs)

def to_dict(self) -> dict:
all_simple = (
isinstance(self.vector_store, SimpleVectorStore)
and isinstance(self.docstore, SimpleDocumentStore)
isinstance(self.docstore, SimpleDocumentStore)
and isinstance(self.index_store, SimpleIndexStore)
and isinstance(self.graph_store, SimpleGraphStore)
and all(
isinstance(vs, SimpleVectorStore) for vs in self.vector_stores.values()
)
)
if not all_simple:
raise ValueError(
"to_dict only available when using simple doc/index/vector stores"
)

assert isinstance(self.vector_store, SimpleVectorStore)
assert isinstance(self.docstore, SimpleDocumentStore)
assert isinstance(self.index_store, SimpleIndexStore)
assert isinstance(self.graph_store, SimpleGraphStore)

return {
VECTOR_STORE_KEY: self.vector_store.to_dict(),
VECTOR_STORE_KEY: {
key: vector_store.to_dict()
for key, vector_store in self.vector_stores.items()
if isinstance(vector_store, SimpleVectorStore)
},
DOC_STORE_KEY: self.docstore.to_dict(),
INDEX_STORE_KEY: self.index_store.to_dict(),
GRAPH_STORE_KEY: self.graph_store.to_dict(),
Expand All @@ -148,12 +187,25 @@ def to_dict(self) -> dict:
def from_dict(cls, save_dict: dict) -> "StorageContext":
"""Create a StorageContext from dict."""
docstore = SimpleDocumentStore.from_dict(save_dict[DOC_STORE_KEY])
vector_store = SimpleVectorStore.from_dict(save_dict[VECTOR_STORE_KEY])
index_store = SimpleIndexStore.from_dict(save_dict[INDEX_STORE_KEY])
graph_store = SimpleGraphStore.from_dict(save_dict[GRAPH_STORE_KEY])

vector_stores: Dict[str, VectorStore] = {}
for key, vector_store_dict in save_dict[VECTOR_STORE_KEY].items():
vector_stores[key] = SimpleVectorStore.from_dict(vector_store_dict)

return cls(
docstore=docstore,
index_store=index_store,
vector_store=vector_store,
vector_stores=vector_stores,
graph_store=graph_store,
)

@property
def vector_store(self) -> VectorStore:
    """Return the vector store registered under the default namespace.

    Kept for backwards compatibility with callers that read a single
    ``vector_store`` attribute instead of the ``vector_stores`` mapping.

    Raises:
        KeyError: if no store is registered under ``DEFAULT_VECTOR_STORE``.
    """
    default_store = self.vector_stores[DEFAULT_VECTOR_STORE]
    return default_store

def add_vector_store(self, vector_store: VectorStore, namespace: str) -> None:
    """Register ``vector_store`` in this storage context under ``namespace``.

    A store already registered under the same namespace is replaced.
    """
    registry = self.vector_stores
    registry[namespace] = vector_store
37 changes: 35 additions & 2 deletions llama_index/vector_stores/simple.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,9 @@

MMR_MODE = VectorStoreQueryMode.MMR

# Separator placed between a namespace and the persist filename when a
# namespaced vector store is written to / read from disk
# (i.e. "<namespace><NAMESPACE_SEP><DEFAULT_PERSIST_FNAME>").
NAMESPACE_SEP = "__"
# Namespace key used for the vector store when no explicit namespace is given.
DEFAULT_VECTOR_STORE = "default"


def _build_metadata_filter_fn(
metadata_lookup_fn: Callable[[str], Mapping[str, Any]],
Expand Down Expand Up @@ -107,15 +110,45 @@ def __init__(
def from_persist_dir(
cls,
persist_dir: str = DEFAULT_PERSIST_DIR,
namespace: Optional[str] = None,
fs: Optional[fsspec.AbstractFileSystem] = None,
) -> "SimpleVectorStore":
"""Load from persist dir."""
if namespace:
persist_fname = f"{namespace}{NAMESPACE_SEP}{DEFAULT_PERSIST_FNAME}"
else:
persist_fname = DEFAULT_PERSIST_FNAME

if fs is not None:
persist_path = concat_dirs(persist_dir, DEFAULT_PERSIST_FNAME)
persist_path = concat_dirs(persist_dir, persist_fname)
else:
persist_path = os.path.join(persist_dir, DEFAULT_PERSIST_FNAME)
persist_path = os.path.join(persist_dir, persist_fname)
return cls.from_persist_path(persist_path, fs=fs)

@classmethod
def from_namespaced_persist_dir(
    cls,
    persist_dir: str = DEFAULT_PERSIST_DIR,
    fs: Optional[fsspec.AbstractFileSystem] = None,
) -> Dict[str, VectorStore]:
    """Load every vector store persisted in ``persist_dir``, keyed by namespace.

    Files named ``<namespace><NAMESPACE_SEP><DEFAULT_PERSIST_FNAME>`` are
    loaded under ``<namespace>``. A bare ``DEFAULT_PERSIST_FNAME`` file
    (written before namespacing existed) is loaded under
    ``DEFAULT_VECTOR_STORE``. Any other file is ignored.

    Args:
        persist_dir: Directory containing the persisted vector stores.
        fs: Optional fsspec filesystem. When given, the directory is listed
            through it so that remote backends work; otherwise the local
            disk is listed with ``os.listdir``.

    Returns:
        Mapping of namespace to the loaded vector store. Empty when the
        directory holds no matching files.
    """
    if fs is not None:
        # ``os.listdir`` only sees the local disk, which breaks remote
        # filesystems (e.g. gcsfs). ``fs.ls`` yields full paths (or detail
        # dicts on some backends), so reduce each entry to its file name.
        fnames = []
        for entry in fs.ls(persist_dir, detail=False):
            path = entry["name"] if isinstance(entry, dict) else entry
            fnames.append(str(path).rstrip("/").rsplit("/", 1)[-1])
    else:
        fnames = os.listdir(persist_dir)

    namespaced_suffix = f"{NAMESPACE_SEP}{DEFAULT_PERSIST_FNAME}"
    vector_stores: Dict[str, VectorStore] = {}
    for fname in fnames:
        if fname == DEFAULT_PERSIST_FNAME:
            # handle backwards compatibility with stores that were persisted
            # before namespaces were introduced
            vector_stores[DEFAULT_VECTOR_STORE] = cls.from_persist_dir(
                persist_dir=persist_dir, fs=fs
            )
        elif fname.endswith(namespaced_suffix):
            # Strip the suffix rather than splitting on the separator, so a
            # namespace that itself contains NAMESPACE_SEP round-trips.
            namespace = fname[: -len(namespaced_suffix)]
            if not namespace:
                continue
            vector_stores[namespace] = cls.from_persist_dir(
                persist_dir=persist_dir, namespace=namespace, fs=fs
            )

    return vector_stores

@property
def client(self) -> None:
"""Get client."""
Expand Down