Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion breaking-changes.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ This is a summary of changes:
- Collapsed the `vector_store` dict into a single root-level object. This is because we no longer support multi-search, and this dict required a lot of downstream complexity for that single use case.
- Removed the `outputs` block that was also only used for multi-search.
- Most workflows had an undocumented `strategy` config dict that allowed fine tuning of internal settings. These fine tunings are never used and had associated complexity, so we removed it.
- Vector store configuration now allows custom schema per embedded field. This overrides the need for the `container_name` prefix, which caused confusion anyway. Now, the default container name will simply be the embedded field name - if you need something custom, add the `embeddings_schema` block and populate as needed.
- Vector store configuration now allows custom schema per embedded field. This overrides the need for the `container_name` prefix, which caused confusion anyway. Now, the default container name will simply be the embedded field name - if you need something custom, add the `index_schema` block and populate as needed.
- We previously supported the ability to embed any text field in the data model. However, we only ever use text_unit_text, entity_description, and community_full_content, so all others have been removed.
- Removed the `umap` and `embed_graph` blocks which were only used to add x/y fields to the entities. This fixed a long-standing dependency issue with graspologic. If you need x/y positions, see the [visualization guide](https://microsoft.github.io/graphrag/visualization_guide/) for using gephi.
- Removed file filtering from input document loading. This was essentially unused.
Expand Down
6 changes: 2 additions & 4 deletions docs/config/yaml.md
Original file line number Diff line number Diff line change
Expand Up @@ -172,9 +172,8 @@ Where to put all vectors for the system. Configured for lancedb by default. This
- `url` **str** (only for AI Search) - AI Search endpoint
- `api_key` **str** (optional - only for AI Search) - The AI Search api key to use.
- `audience` **str** (only for AI Search) - Audience for managed identity token if managed identity authentication is used.
- `index_prefix` **str** - (optional) A prefix for the indexes you will create for embeddings. This stores all indexes (tables) for a given dataset ingest.
- `database_name` **str** - (cosmosdb only) Name of the database.
- `embeddings_schema` **dict[str, dict[str, str]]** (optional) - Enables customization for each of your embeddings.
- `index_schema` **dict[str, dict[str, str]]** (optional) - Enables customization for each of your embeddings.
- `<supported_embedding>`:
- `index_name` **str**: (optional) - Name for the specific embedding index table.
- `id_field` **str**: (optional) - Field name to be used as id. Default=`id`
Expand All @@ -193,8 +192,7 @@ For example:
vector_store:
type: lancedb
db_uri: output/lancedb
index_prefix: "christmas-carol"
embeddings_schema:
index_schema:
text_unit_text:
index_name: "text-unit-embeddings"
id_field: "id_custom"
Expand Down
234 changes: 109 additions & 125 deletions docs/examples_notebooks/custom_vector_store.ipynb

Large diffs are not rendered by default.

15 changes: 7 additions & 8 deletions docs/examples_notebooks/drift_search.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@
"from graphrag.config.enums import ModelType\n",
"from graphrag.config.models.drift_search_config import DRIFTSearchConfig\n",
"from graphrag.config.models.language_model_config import LanguageModelConfig\n",
"from graphrag.config.models.vector_store_schema_config import VectorStoreSchemaConfig\n",
"from graphrag.language_model.manager import ModelManager\n",
"from graphrag.query.indexer_adapters import (\n",
" read_indexer_entities,\n",
Expand All @@ -36,7 +35,7 @@
")\n",
"from graphrag.query.structured_search.drift_search.search import DRIFTSearch\n",
"from graphrag.tokenizer.get_tokenizer import get_tokenizer\n",
"from graphrag.vector_stores.lancedb import LanceDBVectorStore\n",
"from graphrag_vectors.lancedb import LanceDBVectorStore\n",
"\n",
"INPUT_DIR = \"./inputs/operation dulce\"\n",
"LANCEDB_URI = f\"{INPUT_DIR}/lancedb\"\n",
Expand All @@ -61,16 +60,16 @@
"# load description embeddings to an in-memory lancedb vectorstore\n",
"# to connect to a remote db, specify url and port values.\n",
"description_embedding_store = LanceDBVectorStore(\n",
" vector_store_schema_config=VectorStoreSchemaConfig(index_name=\"entity_description\"),\n",
" db_uri=LANCEDB_URI,\n",
" index_name=\"entity_description\",\n",
")\n",
"description_embedding_store.connect(db_uri=LANCEDB_URI)\n",
"description_embedding_store.connect()\n",
"\n",
"full_content_embedding_store = LanceDBVectorStore(\n",
" vector_store_schema_config=VectorStoreSchemaConfig(\n",
" index_name=\"community_full_content\"\n",
" )\n",
" db_uri=LANCEDB_URI,\n",
" index_name=\"community_full_content\",\n",
")\n",
"full_content_embedding_store.connect(db_uri=LANCEDB_URI)\n",
"full_content_embedding_store.connect()\n",
"\n",
"print(f\"Entity count: {len(entity_df)}\")\n",
"entity_df.head()\n",
Expand Down
7 changes: 2 additions & 5 deletions docs/examples_notebooks/local_search.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
"import os\n",
"\n",
"import pandas as pd\n",
"from graphrag.config.models.vector_store_schema_config import VectorStoreSchemaConfig\n",
"from graphrag.query.context_builder.entity_extraction import EntityVectorStoreKey\n",
"from graphrag.query.indexer_adapters import (\n",
" read_indexer_covariates,\n",
Expand All @@ -33,7 +32,7 @@
" LocalSearchMixedContext,\n",
")\n",
"from graphrag.query.structured_search.local_search.search import LocalSearch\n",
"from graphrag.vector_stores.lancedb import LanceDBVectorStore"
"from graphrag_vectors import IndexSchema, LanceDBVectorStore"
]
},
{
Expand Down Expand Up @@ -101,9 +100,7 @@
"# load description embeddings to an in-memory lancedb vectorstore\n",
"# to connect to a remote db, specify url and port values.\n",
"description_embedding_store = LanceDBVectorStore(\n",
" vector_store_schema_config=VectorStoreSchemaConfig(\n",
" index_name=\"default-entity-description\"\n",
" )\n",
" index_schema=IndexSchema(index_name=\"default-entity-description\")\n",
")\n",
"description_embedding_store.connect(db_uri=LANCEDB_URI)\n",
"\n",
Expand Down
109 changes: 109 additions & 0 deletions packages/graphrag-vectors/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# GraphRAG Vectors

Vector store implementations for GraphRAG.

## Basic Usage

### Using the utility function (recommended)

```python
from graphrag_vectors import (
    create_vector_store,
    IndexSchema,
    VectorStoreConfig,
    VectorStoreType,
)

# Create a vector store using the convenience function
store_config = VectorStoreConfig(
type="lancedb",
db_uri="lance"
)

schema_config = IndexSchema(
index_name="my_index",
vector_size=1536,
)

vector_store = create_vector_store(
    config=store_config,
    index_schema=schema_config,
)

vector_store.connect()
vector_store.create_index()
```

### Using the factory directly

```python
from graphrag_vectors import (
VectorStoreFactory,
vector_store_factory,
VectorStoreType,
IndexSchema,
)

# Create a vector store using the factory
schema_config = IndexSchema(
index_name="my_index",
vector_size=1536,
)

vector_store = vector_store_factory.create(
VectorStoreType.LanceDB,
{
"index_schema": schema_config,
"db_uri": "./lancedb"
}
)

vector_store.connect()
vector_store.create_index()
```

## Supported Vector Stores

- **LanceDB**: Local vector database
- **Azure AI Search**: Azure's managed search service with vector capabilities
- **Azure Cosmos DB**: Azure's NoSQL database with vector search support

## Custom Vector Store

You can register custom vector store implementations:

```python
from graphrag_vectors import (
    IndexSchema,
    VectorStore,
    VectorStoreConfig,
    create_vector_store,
    register_vector_store,
)

class MyCustomVectorStore(VectorStore):
def __init__(self, my_param):
self.my_param = my_param

def connect(self):
# Implementation
pass

def create_index(self):
# Implementation
pass

# ... implement other required methods

# Register your custom implementation
register_vector_store("my_custom_store", MyCustomVectorStore)

# Use your custom vector store
config = VectorStoreConfig(
    type="my_custom_store",
    my_param="something"
)
schema_config = IndexSchema(index_name="my_index")
custom_store = create_vector_store(
    config=config,
    index_schema=schema_config,
)
```

## Configuration

Vector stores are configured using:
- `VectorStoreConfig`: baseline parameters for the store
- `IndexSchema`: Schema configuration for the specific index to create/connect to (index name, field names, vector size)
34 changes: 34 additions & 0 deletions packages/graphrag-vectors/graphrag_vectors/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copyright (c) 2024 Microsoft Corporation.
# Licensed under the MIT License

"""GraphRAG vector store implementations."""

from graphrag_vectors.index_schema import IndexSchema
from graphrag_vectors.types import TextEmbedder
from graphrag_vectors.vector_store import (
VectorStore,
VectorStoreDocument,
VectorStoreSearchResult,
)
from graphrag_vectors.vector_store_config import VectorStoreConfig
from graphrag_vectors.vector_store_factory import (
VectorStoreFactory,
create_vector_store,
register_vector_store,
vector_store_factory,
)
from graphrag_vectors.vector_store_type import VectorStoreType

__all__ = [
"IndexSchema",
"TextEmbedder",
"VectorStore",
"VectorStoreConfig",
"VectorStoreDocument",
"VectorStoreFactory",
"VectorStoreSearchResult",
"VectorStoreType",
"create_vector_store",
"register_vector_store",
"vector_store_factory",
]
Original file line number Diff line number Diff line change
Expand Up @@ -22,49 +22,59 @@
)
from azure.search.documents.models import VectorizedQuery

from graphrag.data_model.types import TextEmbedder
from graphrag.vector_stores.base import (
BaseVectorStore,
from graphrag_vectors.vector_store import (
VectorStore,
VectorStoreDocument,
VectorStoreSearchResult,
)


class AzureAISearchVectorStore(BaseVectorStore):
class AzureAISearchVectorStore(VectorStore):
"""Azure AI Search vector storage implementation."""

index_client: SearchIndexClient

def connect(self, **kwargs: Any) -> Any:
def __init__(
self,
url: str,
api_key: str | None = None,
audience: str | None = None,
vector_search_profile_name: str = "vectorSearchProfile",
**kwargs: Any,
):
super().__init__(**kwargs)
if not url:
msg = "url must be provided for Azure AI Search."
raise ValueError(msg)
self.url = url
self.api_key = api_key
self.audience = audience
self.vector_search_profile_name = vector_search_profile_name

def connect(self) -> Any:
"""Connect to AI search vector storage."""
url = kwargs["url"]
api_key = kwargs.get("api_key")
audience = kwargs.get("audience")

self.vector_search_profile_name = kwargs.get(
"vector_search_profile_name", "vectorSearchProfile"
audience_arg = (
{"audience": self.audience} if self.audience and not self.api_key else {}
)
self.db_connection = SearchClient(
endpoint=self.url,
index_name=self.index_name,
credential=(
AzureKeyCredential(self.api_key)
if self.api_key
else DefaultAzureCredential()
),
**audience_arg,
)
self.index_client = SearchIndexClient(
endpoint=self.url,
credential=(
AzureKeyCredential(self.api_key)
if self.api_key
else DefaultAzureCredential()
),
**audience_arg,
)

if url:
audience_arg = {"audience": audience} if audience and not api_key else {}
self.db_connection = SearchClient(
endpoint=url,
index_name=self.index_name if self.index_name else "",
credential=(
AzureKeyCredential(api_key) if api_key else DefaultAzureCredential()
),
**audience_arg,
)
self.index_client = SearchIndexClient(
endpoint=url,
credential=(
AzureKeyCredential(api_key) if api_key else DefaultAzureCredential()
),
**audience_arg,
)
else:
not_supported_error = "Azure AI Search expects `url`."
raise ValueError(not_supported_error)

def create_index(self) -> None:
"""Load documents into an Azure AI Search index."""
Expand Down Expand Up @@ -93,7 +103,7 @@ def create_index(self) -> None:
)
# Configure the index
index = SearchIndex(
name=self.index_name if self.index_name else "",
name=self.index_name,
fields=[
SimpleField(
name=self.id_field,
Expand Down Expand Up @@ -154,17 +164,6 @@ def similarity_search_by_vector(
for doc in response
]

def similarity_search_by_text(
self, text: str, text_embedder: TextEmbedder, k: int = 10
) -> list[VectorStoreSearchResult]:
"""Perform a text-based similarity search."""
query_embedding = text_embedder(text)
if query_embedding:
return self.similarity_search_by_vector(
query_embedding=query_embedding, k=k
)
return []

def search_by_id(self, id: str) -> VectorStoreDocument:
"""Search for a document by id."""
response = self.db_connection.get_document(id)
Expand Down
Loading