feat(nodestore): add Postgres for the doc and index store (#1706)

dbzoo · web-flow · commit 68b3a34b032a · 2024-03-14T17:12:33.000+01:00
* Adding Postgres for the doc and index store

* Adding documentation.  Rename postgres database local->simple.  Postgres storage dependencies

* Update documentation for postgres storage

* Renaming feature to nodestore

* update docstore -> nodestore in doc

* missed some docstore changes in doc

* Updated poetry.lock

* Formatting updates to pass ruff/black checks

* Correction to unreachable code!

* Format adjustment to pass black test

* Adjust extra inclusion name for vector pg

* extra dep change for pg vector

* storage-postgres -> storage-nodestore-postgres

* Hash change on poetry lock
diff --git a/fern/docs.yml b/fern/docs.yml
@@ -58,6 +58,8 @@ navigation:
         contents:
           - page: Vector Stores
             path: ./docs/pages/manual/vectordb.mdx
+          - page: Node Stores
+            path: ./docs/pages/manual/nodestore.mdx
       - section: Advanced Setup
         contents:
           - page: LLM Backends
diff --git a/fern/docs/pages/manual/nodestore.mdx b/fern/docs/pages/manual/nodestore.mdx
@@ -0,0 +1,66 @@
+## NodeStores
+PrivateGPT supports **Simple** and [Postgres](https://www.postgresql.org/) providers, with Simple being the default.
+
+In order to select one or the other, set the `nodestore.database` property in the `settings.yaml` file to `simple` or `postgres`.
+
+```yaml
+nodestore:
+  database: simple
+```
+
+### Simple Document Store
+
+The simple document store keeps data in memory and persists it to disk.
+
+Enabling the simple document store is an excellent choice for small projects or proofs of concept where you need to persist data while maintaining minimal setup complexity. To get started, set the `nodestore.database` property in your `settings.yaml` file as follows:
+
+```yaml
+nodestore:
+  database: simple
+```
+The beauty of the simple document store is its flexibility and ease of implementation. It provides a solid foundation for managing and retrieving data without the need for complex setup or configuration. The combination of in-memory processing and disk persistence ensures that you can efficiently handle small to medium-sized datasets while maintaining data consistency across runs.
+
+### Postgres Document Store
+
+To enable Postgres, set the `nodestore.database` property in the `settings.yaml` file to `postgres` and install the `storage-nodestore-postgres` extra. Note: vector embeddings storage in Postgres is configured separately.
+
+```bash
+poetry install --extras storage-nodestore-postgres
+```
+
+The available configuration options are:
+| Field         | Description                                               |
+|---------------|-----------------------------------------------------------|
+| **host**      | The server hosting the Postgres database. Default is `localhost` |
+| **port**      | The port on which the Postgres database is accessible. Default is `5432` |
+| **database**  | The specific database to connect to. Default is `postgres` |
+| **user**      | The username for database access. Default is `postgres` |
+| **password**  | The password for database access. (Required)            |
+| **schema_name** | The database schema to use. Default is `public`            |
+
+For example:
+```yaml
+nodestore:
+  database: postgres
+
+postgres:
+  host: localhost
+  port: 5432
+  database: postgres
+  user: postgres
+  password: <PASSWORD>
+  schema_name: private_gpt
+```
+
+Given the above configuration, two PostgreSQL tables will be created upon successful connection: one for storing metadata related to the index and another for the document data itself.
+
+```
+postgres=# \dt private_gpt.*
+                  List of relations
+   Schema    |      Name       | Type  |    Owner     
+-------------+-----------------+-------+--------------
+ private_gpt | data_docstore   | table | postgres
+ private_gpt | data_indexstore | table | postgres
+
+postgres=# 
+```
diff --git a/fern/docs/pages/manual/vectordb.mdx b/fern/docs/pages/manual/vectordb.mdx
@@ -51,10 +51,10 @@ By default `chroma` will use a disk-based database stored in local_data_path / "
 
 ### PGVector
 
-To enable PGVector, set the `vectorstore.database` property in the `settings.yaml` file to `pgvector` and install the `pgvector` extra.
+To enable PGVector, set the `vectorstore.database` property in the `settings.yaml` file to `pgvector` and install the `vector-stores-postgres` extra.
 
 ```bash
-poetry install --extras pgvector
+poetry install --extras vector-stores-postgres
 ```
 
 PGVector settings can be configured by setting values to the `pgvector` property in the `settings.yaml` file.
diff --git a/poetry.lock b/poetry.lock
diff --git a/private_gpt/components/node_store/node_store_component.py b/private_gpt/components/node_store/node_store_component.py
@@ -6,6 +6,7 @@
 from llama_index.core.storage.index_store.types import BaseIndexStore
 
 from private_gpt.paths import local_data_path
+from private_gpt.settings.settings import Settings
 
 logger = logging.getLogger(__name__)
 
@@ -16,19 +17,64 @@ class NodeStoreComponent:
     doc_store: BaseDocumentStore
 
     @inject
-    def __init__(self) -> None:
-        try:
-            self.index_store = SimpleIndexStore.from_persist_dir(
-                persist_dir=str(local_data_path)
-            )
-        except FileNotFoundError:
-            logger.debug("Local index store not found, creating a new one")
-            self.index_store = SimpleIndexStore()
-
-        try:
-            self.doc_store = SimpleDocumentStore.from_persist_dir(
-                persist_dir=str(local_data_path)
-            )
-        except FileNotFoundError:
-            logger.debug("Local document store not found, creating a new one")
-            self.doc_store = SimpleDocumentStore()
+    def __init__(self, settings: Settings) -> None:
+        """Create the index and document stores selected by the settings.
+
+        `settings.nodestore.database` chooses the backend: "simple" loads
+        the stores persisted under `local_data_path` (creating empty ones
+        when none are found), while "postgres" builds both stores from the
+        `settings.postgres` connection parameters.
+
+        Raises:
+            ImportError: If the Postgres store packages are not installed.
+            ValueError: If the `postgres` settings are missing or the
+                configured database is not supported.
+        """
+        match settings.nodestore.database:
+            case "simple":
+                # Reuse stores persisted on disk; start empty ones when none are found.
+                try:
+                    self.index_store = SimpleIndexStore.from_persist_dir(
+                        persist_dir=str(local_data_path)
+                    )
+                except FileNotFoundError:
+                    logger.debug("Local index store not found, creating a new one")
+                    self.index_store = SimpleIndexStore()
+
+                try:
+                    self.doc_store = SimpleDocumentStore.from_persist_dir(
+                        persist_dir=str(local_data_path)
+                    )
+                except FileNotFoundError:
+                    logger.debug("Local document store not found, creating a new one")
+                    self.doc_store = SimpleDocumentStore()
+
+            case "postgres":
+                # Imported lazily: these packages are only installed with the
+                # `storage-nodestore-postgres` extra.
+                try:
+                    from llama_index.storage.docstore.postgres import (
+                        PostgresDocumentStore,
+                    )
+                    from llama_index.storage.index_store.postgres import (
+                        PostgresIndexStore,
+                    )
+                except ImportError:
+                    raise ImportError(
+                        "Postgres dependencies not found, install with `poetry install --extras storage-nodestore-postgres`"
+                    ) from None
+
+                if settings.postgres is None:
+                    raise ValueError("Postgres index/doc store settings not found.")
+
+                # Both stores share the same connection parameters.
+                postgres_params = settings.postgres.model_dump(exclude_none=True)
+                self.index_store = PostgresIndexStore.from_params(**postgres_params)
+                self.doc_store = PostgresDocumentStore.from_params(**postgres_params)
+
+            case _:
+                # Should be unreachable
+                # The settings validator should have caught this
+                raise ValueError(
+                    f"Database {settings.nodestore.database} not supported"
+                )
diff --git a/private_gpt/settings/settings.py b/private_gpt/settings/settings.py
@@ -108,6 +108,10 @@ class VectorstoreSettings(BaseModel):
     database: Literal["chroma", "qdrant", "pgvector"]
 
 
+class NodeStoreSettings(BaseModel):
+    database: Literal["simple", "postgres"]
+
+
 class LlamaCPPSettings(BaseModel):
     llm_hf_repo_id: str
     llm_hf_model_file: str
@@ -249,7 +253,7 @@ class UISettings(BaseModel):
     )
 
 
-class PGVectorSettings(BaseModel):
+class PostgresSettings(BaseModel):
     host: str = Field(
         "localhost",
         description="The server hosting the Postgres database",
@@ -270,14 +274,17 @@ class PGVectorSettings(BaseModel):
         "postgres",
         description="The database to use to connect to the Postgres database",
     )
+    schema_name: str = Field(
+        "public",
+        description="The name of the schema in the Postgres database to use",
+    )
+
+
+class PGVectorSettings(PostgresSettings):
     embed_dim: int = Field(
         384,
         description="The dimension of the embeddings stored in the Postgres database",
     )
-    schema_name: str = Field(
-        "public",
-        description="The name of the schema in the Postgres database where the embeddings are stored",
-    )
     table_name: str = Field(
         "embeddings",
         description="The name of the table in the Postgres database where the embeddings are stored",
@@ -350,7 +357,9 @@ class Settings(BaseModel):
     openai: OpenAISettings
     ollama: OllamaSettings
     vectorstore: VectorstoreSettings
+    nodestore: NodeStoreSettings
     qdrant: QdrantSettings | None = None
+    postgres: PostgresSettings | None = None
     pgvector: PGVectorSettings | None = None
 
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -27,6 +27,12 @@ llama-index-embeddings-openai = {version ="^0.1.6", optional = true}
 llama-index-vector-stores-qdrant = {version ="^0.1.3", optional = true}
 llama-index-vector-stores-chroma = {version ="^0.1.4", optional = true}
 llama-index-vector-stores-postgres = {version ="^0.1.2", optional = true}
+llama-index-storage-docstore-postgres = {version ="^0.1.2", optional = true}
+llama-index-storage-index-store-postgres = {version ="^0.1.2", optional = true}
+# Postgres
+psycopg2-binary = {version ="^2.9.9", optional = true}
+asyncpg = {version="^0.29.0", optional = true}
+
 # Optional Sagemaker dependency
 boto3 = {version ="^1.34.51", optional = true}
 # Optional UI
@@ -46,7 +52,7 @@ embeddings-sagemaker = ["boto3"]
 vector-stores-qdrant = ["llama-index-vector-stores-qdrant"]
 vector-stores-chroma = ["llama-index-vector-stores-chroma"]
 vector-stores-postgres = ["llama-index-vector-stores-postgres"]
-
+storage-nodestore-postgres = ["llama-index-storage-docstore-postgres","llama-index-storage-index-store-postgres","psycopg2-binary","asyncpg"]
 
 [tool.poetry.group.dev.dependencies]
 black = "^22"
diff --git a/settings-ollama-pg.yaml b/settings-ollama-pg.yaml
@@ -0,0 +1,43 @@
+# Using ollama and postgres for the vector, doc and index store. Ollama is also used for embeddings.
+# To use install these extras:
+# poetry install --extras "llms-ollama ui vector-stores-postgres embeddings-ollama storage-nodestore-postgres"
+server:
+  env_name: ${APP_ENV:ollama}
+
+llm:
+  mode: ollama
+  max_new_tokens: 512
+  context_window: 3900
+
+embedding:
+  mode: ollama
+
+ollama:
+  llm_model: mistral
+  embedding_model: nomic-embed-text
+  api_base: http://localhost:11434
+
+nodestore:
+  database: postgres
+
+vectorstore:
+  database: pgvector
+
+pgvector:
+  host: localhost
+  port: 5432
+  database: postgres
+  user: postgres
+  password: admin
+  embed_dim: 768
+  schema_name: private_gpt
+  table_name: embeddings
+
+postgres:
+  host: localhost
+  port: 5432
+  database: postgres
+  user: postgres
+  password: admin
+  schema_name: private_gpt
+
diff --git a/settings.yaml b/settings.yaml
@@ -62,6 +62,9 @@ huggingface:
 vectorstore:
   database: qdrant
 
+nodestore:
+  database: simple
+
 qdrant:
   path: local_data/private_gpt/qdrant
 
@@ -75,6 +78,14 @@ pgvector:
   schema_name: private_gpt
   table_name: embeddings
 
+postgres:
+  host: localhost
+  port: 5432
+  database: postgres
+  user: postgres
+  password: postgres
+  schema_name: private_gpt
+
 sagemaker:
   llm_endpoint_name: huggingface-pytorch-tgi-inference-2023-09-25-19-53-32-140
   embedding_endpoint_name: huggingface-pytorch-inference-2023-11-03-07-41-36-479