diff --git a/docs/examples_notebooks/custom_vector_store.ipynb b/docs/examples_notebooks/custom_vector_store.ipynb index 2e79c66d8..22c9850d9 100644 --- a/docs/examples_notebooks/custom_vector_store.ipynb +++ b/docs/examples_notebooks/custom_vector_store.ipynb @@ -34,7 +34,7 @@ "4. Testing and validating your implementation\n", "5. Configuring GraphRAG to use your custom vector store\n", "\n", - "Let's get started!" + "Let's get started!\n" ] }, { @@ -47,7 +47,7 @@ "\n", "```bash\n", "pip install graphrag\n", - "```" + "```\n" ] }, { @@ -56,14 +56,40 @@ "source": [ "## Step 2: Understand the VectorStore Interface\n", "\n", - "Before using a custom vector store, let's examine the `VectorStore` interface to understand what methods need to be implemented." + "Before using a custom vector store, let's examine the `VectorStore` interface to understand what methods need to be implemented.\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "VectorStore Abstract Methods:\n", + "================================================================================\n", + "\n", + "connect:\n", + " (self) -> None\n", + "\n", + "create_index:\n", + " (self) -> None\n", + "\n", + "load_documents:\n", + " (self, documents: list[graphrag_vectors.vector_store.VectorStoreDocument]) -> None\n", + "\n", + "search_by_id:\n", + " (self, id: str) -> graphrag_vectors.vector_store.VectorStoreDocument\n", + "\n", + "similarity_search_by_vector:\n", + " (self, query_embedding: list[float], k: int = 10) -> list[graphrag_vectors.vector_store.VectorStoreSearchResult]\n", + "\n", + "Total abstract methods to implement: 5\n" + ] + } + ], "source": [ "import inspect\n", "\n", @@ -76,9 +102,11 @@ " IndexSchema,\n", " TextEmbedder,\n", " VectorStore,\n", + " VectorStoreConfig,\n", " VectorStoreDocument,\n", - " VectorStoreFactory,\n", " 
VectorStoreSearchResult,\n", + " create_vector_store,\n", + " register_vector_store,\n", ")\n", "\n", "print(\"VectorStore Abstract Methods:\")\n", @@ -105,12 +133,12 @@ "- Store documents and vectors in memory using Python data structures\n", "- Support all required VectorStore methods\n", "\n", - "**Note**: This is a simplified example for demonstration. Production vector stores would typically use optimized libraries like FAISS, more sophisticated indexing, and persistent storage." + "**Note**: This is a simplified example for demonstration. Production vector stores would typically use optimized libraries like FAISS, more sophisticated indexing, and persistent storage.\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 29, "metadata": {}, "outputs": [], "source": [ @@ -130,10 +158,14 @@ " vectors: dict[str, np.ndarray]\n", " connected: bool\n", "\n", - " def __init__(self, **kwargs: Any):\n", + " def __init__(self, custom_config_option: str, **kwargs: Any):\n", " \"\"\"Initialize the in-memory vector store.\"\"\"\n", " super().__init__(**kwargs)\n", "\n", + " # Not actually used in this simple implementation, but included to show\n", + " # how custom configuration options can be passed.\n", + " self.custom_config_option = custom_config_option\n", + "\n", " self.documents: dict[str, VectorStoreDocument] = {}\n", " self.vectors: dict[str, np.ndarray] = {}\n", " self.connected = False\n", @@ -162,6 +194,9 @@ " self, documents: list[VectorStoreDocument], overwrite: bool = False\n", " ) -> None:\n", " \"\"\"Load documents into the vector store.\"\"\"\n", + " if not self.connected:\n", + " msg = \"Vector store is not connected. 
Call connect() first.\"\n", + " raise RuntimeError(msg)\n", " if overwrite:\n", " print(\"Clearing existing documents...\")\n", " self.documents.clear()\n", @@ -179,6 +214,9 @@ " self, query_embedding: list[float], k: int = 10, **kwargs: Any\n", " ) -> list[VectorStoreSearchResult]:\n", " \"\"\"Search for similar documents using a query vector.\"\"\"\n", + " if not self.connected:\n", + " msg = \"Vector store is not connected. Call connect() first.\"\n", + " raise RuntimeError(msg)\n", " if not self.vectors:\n", " return []\n", "\n", @@ -219,9 +257,9 @@ " # Use vector search\n", " return self.similarity_search_by_vector(query_embedding, k, **kwargs)\n", "\n", - " def search_by_id(self, id: str) -> VectorStoreDocument | None:\n", + " def search_by_id(self, id: str) -> VectorStoreDocument:\n", " \"\"\"Retrieve a document by its ID.\"\"\"\n", - " return self.documents.get(id)" + " return self.documents[id]" ] }, { @@ -230,29 +268,30 @@ "source": [ "## Step 4: Register the Custom Vector Store\n", "\n", - "Now let's register our custom vector store with the `VectorStoreFactory` so it can be used throughout GraphRAG." 
+ "Now let's register our custom vector store using `register_vector_store` so it can be used throughout GraphRAG.\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Registered custom vector store with type: 'simple_memory'\n" + ] + } + ], "source": [ "# Register our custom vector store with a unique identifier\n", "CUSTOM_VECTOR_STORE_TYPE = \"simple_memory\"\n", "\n", "# Register the vector store class\n", - "VectorStoreFactory().register(CUSTOM_VECTOR_STORE_TYPE, SimpleInMemoryVectorStore)\n", - "\n", - "print(f\"✅ Registered custom vector store with type: '{CUSTOM_VECTOR_STORE_TYPE}'\")\n", + "register_vector_store(CUSTOM_VECTOR_STORE_TYPE, SimpleInMemoryVectorStore)\n", "\n", - "# Verify registration\n", - "available_types = VectorStoreFactory().keys()\n", - "print(f\"\\n📋 Available vector store types: {available_types}\")\n", - "print(\n", - " f\"🔍 Is our custom type supported? {CUSTOM_VECTOR_STORE_TYPE in VectorStoreFactory()}\"\n", - ")" + "print(f\"✅ Registered custom vector store with type: '{CUSTOM_VECTOR_STORE_TYPE}'\")" ] }, { @@ -261,14 +300,22 @@ "source": [ "## Step 5: Test the Custom Vector Store\n", "\n", - "Let's create some sample data and test our custom vector store implementation." 
+ "Let's create some sample data and test our custom vector store implementation.\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📝 Created 4 sample documents\n" + ] + } + ], "source": [ "# Create sample documents with mock embeddings\n", "def create_mock_embedding(dimension: int = 384) -> list[float]:\n", @@ -301,48 +348,81 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Created vector store instance: SimpleInMemoryVectorStore\n" + ] + } + ], "source": [ "# Test creating vector store using the factory\n", "schema = IndexSchema(index_name=\"test_collection\")\n", "\n", "# Create vector store instance using factory\n", - "vector_store = VectorStoreFactory().create(\n", - " CUSTOM_VECTOR_STORE_TYPE, {\"index_schema\": schema}\n", + "vector_store: VectorStore = create_vector_store(\n", + " VectorStoreConfig(\n", + " type=CUSTOM_VECTOR_STORE_TYPE,\n", + " custom_config_option=\"example_value\", # type: ignore\n", + " ),\n", + " schema,\n", ")\n", "\n", - "print(f\"✅ Created vector store instance: {type(vector_store).__name__}\")\n", - "print(f\"📊 Initial stats: {vector_store.get_stats()}\")" + "print(f\"✅ Created vector store instance: {type(vector_store).__name__}\")" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 33, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Connecting to in-memory vector store...\n", + "Connected successfully!\n", + "Creating index: test_collection\n", + "Index created successfully!\n", + "Loading 4 documents...\n", + "Successfully loaded 
4 documents!\n" + ] + } + ], "source": [ "# Connect and load documents\n", "vector_store.connect()\n", "vector_store.create_index()\n", - "vector_store.load_documents(sample_documents)\n", - "\n", - "print(f\"📊 Updated stats: {vector_store.get_stats()}\")" + "vector_store.load_documents(sample_documents)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 34, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🔍 Found 3 similar documents:\n", + "\n", + "1. ID: doc_3\n", + " Similarity Score: 0.0648\n", + "\n", + "2. ID: doc_2\n", + " Similarity Score: -0.0071\n", + "\n", + "3. ID: doc_1\n", + " Similarity Score: -0.0293\n", + "\n" + ] + } + ], "source": [ "# Test similarity search\n", "query_vector = create_mock_embedding() # Random query vector for testing\n", @@ -363,9 +443,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "✅ Found document by ID:\n", + " ID: doc_2\n" + ] + } + ], "source": [ "# Test search by ID\n", "try:\n", @@ -382,24 +471,39 @@ "source": [ "## Step 6: Configuration for GraphRAG\n", "\n", - "Now let's see how you would configure GraphRAG to use your custom vector store in a settings file." 
+ "Now let's see how you would configure GraphRAG to use your custom vector store in a settings file.\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "📄 Example settings.yml configuration:\n", + "========================================\n", + "models:\n", + " default_embedding_model:\n", + " model: text-embedding-3-small\n", + " model_provider: openai\n", + " type: embedding\n", + "vector_store:\n", + " custom_config_option: example_value\n", + " type: simple_memory\n", + "\n" + ] + } + ], "source": [ "# Example GraphRAG yaml settings\n", "example_settings = {\n", " \"vector_store\": {\n", - " \"default_vector_store\": {\n", - " \"type\": CUSTOM_VECTOR_STORE_TYPE, # \"simple_memory\"\n", - " \"collection_name\": \"graphrag_entities\",\n", - " # Add any custom parameters your vector store needs\n", - " \"custom_parameter\": \"custom_value\",\n", - " }\n", + " \"type\": CUSTOM_VECTOR_STORE_TYPE, # \"simple_memory\"\n", + " # Add any custom parameters your vector store needs\n", + " \"custom_config_option\": \"example_value\",\n", " },\n", " # Other GraphRAG configuration...\n", " \"models\": {\n", @@ -425,14 +529,35 @@ "source": [ "## Step 7: Integration with GraphRAG Pipeline\n", "\n", - "Here's how your custom vector store would be used in a typical GraphRAG pipeline." 
+ "Here's how your custom vector store would be used in a typical GraphRAG pipeline.\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 37, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🚀 Simulating GraphRAG pipeline with custom vector store...\n", + "\n", + "Connecting to in-memory vector store...\n", + "Connected successfully!\n", + "Creating index: graphrag_entities\n", + "Index created successfully!\n", + "✅ Step 1: Vector store created and connected\n", + "Loading 10 documents...\n", + "Successfully loaded 10 documents!\n", + "✅ Step 2: Loaded 10 entity documents\n", + "✅ Step 3: Found 5 relevant entities for query\n", + "✅ Step 4: Context built using retrieved entities\n", + "\n", + "🎯 Retrieved 5 entities for context building\n" + ] + } + ], "source": [ "# Example of how GraphRAG would use your custom vector store\n", "def simulate_graphrag_pipeline():\n", @@ -442,9 +567,12 @@ " # 1. GraphRAG creates vector store using factory\n", " schema = IndexSchema(index_name=\"graphrag_entities\")\n", "\n", - " store = VectorStoreFactory().create(\n", - " CUSTOM_VECTOR_STORE_TYPE,\n", - " {\"index_schema\": schema, \"similarity_threshold\": 0.3},\n", + " store = create_vector_store(\n", + " VectorStoreConfig(\n", + " type=CUSTOM_VECTOR_STORE_TYPE,\n", + " custom_config_option=\"example_value\", # type: ignore\n", + " ),\n", + " schema,\n", " )\n", " store.connect()\n", " store.create_index()\n", @@ -472,7 +600,6 @@ " context_entities = [result.document for result in relevant_entities]\n", "\n", " print(\"✅ Step 4: Context built using retrieved entities\")\n", - " print(f\"📊 Final stats: {store.get_stats()}\")\n", "\n", " return context_entities\n", "\n", @@ -488,14 +615,42 @@ "source": [ "## Step 8: Testing and Validation\n", "\n", - "Let's create a comprehensive test suite to ensure our vector store works correctly." 
+ "Let's create a comprehensive test suite to ensure our vector store works correctly.\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 38, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "🧪 Running comprehensive vector store tests...\n", + "\n", + "Test 1: Basic functionality\n", + "Connecting to in-memory vector store...\n", + "Connected successfully!\n", + "Creating index: test_collection\n", + "Index created successfully!\n", + "Loading 2 documents...\n", + "Successfully loaded 2 documents!\n", + "✅ Basic functionality test passed\n", + "\n", + "Test 2: Search functionality\n", + "✅ Search functionality test passed\n", + "\n", + "Test 3: Search by ID\n", + "✅ Search by ID test passed\n", + "\n", + "Test 5: Error handling\n", + "✅ Error handling test passed\n", + "\n", + "🎉 All tests passed! Your custom vector store is working correctly.\n" + ] + } + ], "source": [ "def test_custom_vector_store():\n", " \"\"\"Comprehensive test suite for the custom vector store.\"\"\"\n", @@ -503,9 +658,12 @@ "\n", " # Test 1: Basic functionality\n", " print(\"Test 1: Basic functionality\")\n", - " store = VectorStoreFactory().create(\n", - " CUSTOM_VECTOR_STORE_TYPE,\n", - " {\"index_schema\": IndexSchema(index_name=\"test\")},\n", + " store = create_vector_store(\n", + " VectorStoreConfig(\n", + " type=CUSTOM_VECTOR_STORE_TYPE,\n", + " custom_config_option=\"example_value\", # type: ignore\n", + " ),\n", + " schema,\n", " )\n", " store.connect()\n", " store.create_index()\n", @@ -546,9 +704,12 @@ "\n", " # Test 4: Error handling\n", " print(\"\\nTest 5: Error handling\")\n", - " disconnected_store = VectorStoreFactory().create(\n", - " CUSTOM_VECTOR_STORE_TYPE,\n", - " {\"index_schema\": IndexSchema(index_name=\"test2\")},\n", + " disconnected_store = create_vector_store(\n", + " VectorStoreConfig(\n", + " type=CUSTOM_VECTOR_STORE_TYPE,\n", + " 
custom_config_option=\"example_value\", # type: ignore\n", + " ),\n", + " IndexSchema(index_name=\"test2\"),\n", " )\n", "\n", " try:\n", @@ -581,19 +742,23 @@ "Congratulations! You've successfully learned how to implement and register a custom vector store with GraphRAG. Here's what you accomplished:\n", "\n", "### What You Built\n", + "\n", "- ✅ **Custom Vector Store Class**: Implemented `SimpleInMemoryVectorStore` with all required methods\n", - "- ✅ **Factory Integration**: Registered your vector store with `VectorStoreFactory`\n", + "- ✅ **Factory Integration**: Registered your vector store with `register_vector_store`\n", "- ✅ **Comprehensive Testing**: Validated functionality with a full test suite\n", "- ✅ **Configuration Examples**: Learned how to configure GraphRAG to use your vector store\n", "\n", "### Key Takeaways\n", + "\n", "1. **Interface Compliance**: Always implement all methods from `VectorStore`\n", - "2. **Factory Pattern**: Use `VectorStoreFactory.register()` to make your vector store available\n", + "2. **Factory Pattern**: Use `register_vector_store()` to make your vector store available\n", "3. **Testing**: Validate your implementation thoroughly before production use\n", "4. **Configuration**: Use YAML or environment variables for flexible configuration\n", "\n", "### Production Considerations\n", + "\n", "For production use, consider:\n", + "\n", "- **Persistence**: Add data persistence mechanisms\n", "- **Scalability**: Use optimized vector search libraries (FAISS, HNSW)\n", "- **Error Handling**: Implement robust error handling and logging\n", @@ -602,11 +767,12 @@ "- **Monitoring**: Add metrics and health checks\n", "\n", "### Resources\n", + "\n", "- [GraphRAG Documentation](https://microsoft.github.io/graphrag/)\n", "- [Vector Store Examples](https://github.com/microsoft/graphrag/tree/main/packages/graphrag-vectors)\n", "- [GraphRAG GitHub Repository](https://github.com/microsoft/graphrag)\n", "\n", - "Happy building! 🚀" + "Happy building! 
🚀\n" ] } ], @@ -626,7 +792,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.10" + "version": "3.12.4" } }, "nbformat": 4,