Merge branch 'main' into enhancement/conversation-loading-state

arc53 · Oct 22, 2024 · 2e69e9b · 2e69e9b
2 parents 6a024b0 + 8862375
commit 2e69e9b
Show file tree

Hide file tree

Showing 25 changed files with 577 additions and 119 deletions.
diff --git a/.github/dependabot.yml b/.github/dependabot.yml
@@ -13,3 +13,7 @@ updates:
  directory: "/frontend" # Location of package manifests
  schedule:
  interval: "weekly"
+ - package-ecosystem: "github-actions"
+ directory: "/"
+ schedule:
+ interval: "weekly"
diff --git a/.github/holopin.yml b/.github/holopin.yml
@@ -1,5 +1,11 @@
-organization: arc53
-defaultSticker: clqmdf0ed34290glbvqh0kzxd
+organization: docsgpt
+defaultSticker: cm1ulwkkl180570cl82rtzympu
 stickers:
- - id: clqmdf0ed34290glbvqh0kzxd
- alias: festive
+ - id: cm1ulwkkl180570cl82rtzympu
+ alias: contributor2024
+ - id: cm1ureg8o130450cl8c1po6mil
+ alias: api
+ - id: cm1urhmag148240cl8yvqxkthx
+ alias: lpc
+ - id: cm1urlcpq622090cl2tvu4w71y
+ alias: lexeu
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -12,22 +12,22 @@ jobs:
  contents: read
  packages: write
  steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
 
  - name: Set up QEMU
  uses: docker/setup-qemu-action@v1
 
  - name: Set up Docker Buildx
- uses: docker/setup-buildx-action@v1
+ uses: docker/setup-buildx-action@v3
 
  - name: Login to DockerHub
- uses: docker/login-action@v2
+ uses: docker/login-action@v3
  with:
  username: ${{ secrets.DOCKER_USERNAME }}
  password: ${{ secrets.DOCKER_PASSWORD }}
 
  - name: Login to ghcr.io
- uses: docker/login-action@v2
+ uses: docker/login-action@v3
  with:
  registry: ghcr.io
  username: ${{ github.repository_owner }}

diff --git a/.github/workflows/cife.yml b/.github/workflows/cife.yml
@@ -12,22 +12,22 @@ jobs:
  contents: read
  packages: write
  steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
 
  - name: Set up QEMU
  uses: docker/setup-qemu-action@v1
 
  - name: Set up Docker Buildx
- uses: docker/setup-buildx-action@v1
+ uses: docker/setup-buildx-action@v3
 
  - name: Login to DockerHub
- uses: docker/login-action@v2
+ uses: docker/login-action@v3
  with:
  username: ${{ secrets.DOCKER_USERNAME }}
  password: ${{ secrets.DOCKER_PASSWORD }}
 
  - name: Login to ghcr.io
- uses: docker/login-action@v2
+ uses: docker/login-action@v3
  with:
  registry: ghcr.io
  username: ${{ github.repository_owner }}

diff --git a/.github/workflows/docker-develop-build.yml b/.github/workflows/docker-develop-build.yml
@@ -14,22 +14,22 @@ jobs:
  contents: read
  packages: write
  steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
 
  - name: Set up QEMU
  uses: docker/setup-qemu-action@v1
 
  - name: Set up Docker Buildx
- uses: docker/setup-buildx-action@v1
+ uses: docker/setup-buildx-action@v3
 
  - name: Login to DockerHub
- uses: docker/login-action@v2
+ uses: docker/login-action@v3
  with:
  username: ${{ secrets.DOCKER_USERNAME }}
  password: ${{ secrets.DOCKER_PASSWORD }}
 
  - name: Login to ghcr.io
- uses: docker/login-action@v2
+ uses: docker/login-action@v3
  with:
  registry: ghcr.io
  username: ${{ github.repository_owner }}

diff --git a/.github/workflows/docker-develop-fe-build.yml b/.github/workflows/docker-develop-fe-build.yml
@@ -14,22 +14,22 @@ jobs:
  contents: read
  packages: write
  steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
 
  - name: Set up QEMU
  uses: docker/setup-qemu-action@v1
 
  - name: Set up Docker Buildx
- uses: docker/setup-buildx-action@v1
+ uses: docker/setup-buildx-action@v3
 
  - name: Login to DockerHub
- uses: docker/login-action@v2
+ uses: docker/login-action@v3
  with:
  username: ${{ secrets.DOCKER_USERNAME }}
  password: ${{ secrets.DOCKER_PASSWORD }}
 
  - name: Login to ghcr.io
- uses: docker/login-action@v2
+ uses: docker/login-action@v3
  with:
  registry: ghcr.io
  username: ${{ github.repository_owner }}

diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -11,7 +11,7 @@ jobs:
  ruff:
  runs-on: ubuntu-latest
  steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
 
  - name: Lint with Ruff
  uses: chartboost/ruff-action@v1
diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
@@ -8,9 +8,9 @@ jobs:
  matrix:
  python-version: ["3.11"]
  steps:
- - uses: actions/checkout@v3
+ - uses: actions/checkout@v4
  - name: Set up Python ${{ matrix.python-version }}
- uses: actions/setup-python@v4
+ uses: actions/setup-python@v5
  with:
  python-version: ${{ matrix.python-version }}
  - name: Install dependencies
@@ -24,7 +24,7 @@ jobs:
  python -m pytest --cov=application --cov-report=xml
  - name: Upload coverage reports to Codecov
  if: github.event_name == 'pull_request' && matrix.python-version == '3.11'
- uses: codecov/codecov-action@v3
+ uses: codecov/codecov-action@v4
  env:
  CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
 
diff --git a/.github/workflows/sync_fork.yaml b/.github/workflows/sync_fork.yaml
@@ -17,7 +17,7 @@ jobs:
  steps:
  # Step 1: run a standard checkout action
  - name: Checkout target repo
- uses: actions/checkout@v3
+ uses: actions/checkout@v4
 
  # Step 2: run the sync action
  - name: Sync upstream changes

diff --git a/application/api/user/routes.py b/application/api/user/routes.py
@@ -939,7 +939,7 @@ def post(self):
  "conversation_id": DBRef(
  "conversations", ObjectId(conversation_id)
  ),
- "isPromptable": not is_promptable,
+ "isPromptable": is_promptable,
  "first_n_queries": current_n_queries,
  "user": user,
  }
@@ -962,7 +962,7 @@ def post(self):
  "$ref": "conversations",
  "$id": ObjectId(conversation_id),
  },
- "isPromptable": not is_promptable,
+ "isPromptable": is_promptable,
  "first_n_queries": current_n_queries,
  "user": user,
  }

diff --git a/application/core/settings.py b/application/core/settings.py
@@ -18,7 +18,7 @@ class Settings(BaseSettings):
  DEFAULT_MAX_HISTORY: int = 150
  MODEL_TOKEN_LIMITS: dict = {"gpt-3.5-turbo": 4096, "claude-2": 1e5}
  UPLOAD_FOLDER: str = "inputs"
- VECTOR_STORE: str = "faiss"  # "faiss" or "elasticsearch" or "qdrant" or "milvus"
+ VECTOR_STORE: str = "faiss" # "faiss" or "elasticsearch" or "qdrant" or "milvus" or "lancedb"
  RETRIEVERS_ENABLED: list = ["classic_rag", "duckduck_search"] # also brave_search
 
  # LLM Cache
@@ -70,6 +70,9 @@ class Settings(BaseSettings):
  MILVUS_URI: Optional[str] = "./milvus_local.db" # milvus lite version as default
  MILVUS_TOKEN: Optional[str] = ""
 
+ # LanceDB vectorstore config
+ LANCEDB_PATH: str = "/tmp/lancedb" # Path where LanceDB stores its local data
+ LANCEDB_TABLE_NAME: Optional[str] = "docsgpts" # Name of the table to use for storing vectors
  BRAVE_SEARCH_API_KEY: Optional[str] = None
 
  FLASK_DEBUG_MODE: bool = False

diff --git a/application/vectorstore/lancedb.py b/application/vectorstore/lancedb.py
@@ -0,0 +1,170 @@
+from typing import List, Optional
+import importlib
+from application.vectorstore.base import BaseVectorStore
+from application.core.settings import settings
+
+class LanceDBVectorStore(BaseVectorStore):
+    """Vector store backed by a LanceDB table.
+
+    The ``pyarrow`` and ``lancedb`` modules, the database connection and the
+    table handle are all resolved lazily, on first use.
+    """
+
+    def __init__(self, path: str = settings.LANCEDB_PATH,
+                 table_name_prefix: str = settings.LANCEDB_TABLE_NAME,
+                 source_id: Optional[str] = None,
+                 embeddings_key: str = "embeddings"):
+        """Store the connection parameters; nothing is opened here.
+
+        Args:
+            path: Location handed to ``lancedb.connect``.
+            table_name_prefix: Table name, or its prefix when ``source_id`` is set.
+            source_id: When truthy, the table is named ``<prefix>_<source_id>``.
+            embeddings_key: Key forwarded to ``_get_embeddings``.
+        """
+        super().__init__()
+        self.path = path
+        self.table_name = f"{table_name_prefix}_{source_id}" if source_id else table_name_prefix
+        self.embeddings_key = embeddings_key
+        self._lance_db = None  # LanceDB connection, opened on first use
+        self.docsearch = None  # Table handle, opened or created on first use
+        self._pa = None  # pyarrow module, imported on first use
+        self._lancedb_module = None  # lancedb module, imported on first use
+
+    @property
+    def pa(self):
+        """Lazy load pyarrow module."""
+        if self._pa is None:
+            self._pa = importlib.import_module("pyarrow")
+        return self._pa
+
+    @property
+    def lancedb(self):
+        """Lazy load lancedb module."""
+        if self._lancedb_module is None:
+            self._lancedb_module = importlib.import_module("lancedb")
+        return self._lancedb_module
+
+    @property
+    def lance_db(self):
+        """Lazy load the LanceDB connection."""
+        if self._lance_db is None:
+            self._lance_db = self.lancedb.connect(self.path)
+        return self._lance_db
+
+    @property
+    def table(self):
+        """Return the table handle, opening it on first use; None if the table does not exist."""
+        if self.docsearch is None and self.table_name in self.lance_db.table_names():
+            self.docsearch = self.lance_db.open_table(self.table_name)
+        return self.docsearch
+
+    def ensure_table_exists(self):
+        """Create the table if it does not exist yet."""
+        if self.table is not None:
+            return
+        embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)
+        pa = self.pa
+        # add_texts stores metadata as a list of key/value pairs, so the column
+        # must be a list of structs rather than a single struct.
+        metadata_entry = pa.struct([
+            pa.field("key", pa.string()),
+            pa.field("value", pa.string()),
+        ])
+        schema = pa.schema([
+            pa.field("vector", pa.list_(pa.float32(), list_size=embeddings.dimension)),
+            pa.field("text", pa.string()),
+            pa.field("metadata", pa.list_(metadata_entry)),
+        ])
+        self.docsearch = self.lance_db.create_table(self.table_name, schema=schema)
+
+    def add_texts(self, texts: List[str], metadatas: Optional[List[dict]] = None, source_id: str = None):
+        """Embed texts and append them, with their metadata, to the table.
+
+        Args:
+            texts: Documents to embed and store; an empty list is a no-op.
+            metadatas: Optional per-text metadata, same length as ``texts``.
+            source_id: When truthy, written to the ``source_id`` metadata key of every row.
+
+        Raises:
+            ValueError: If ``metadatas`` is non-empty and its length differs from ``texts``.
+        """
+        if not texts:
+            return
+        if not metadatas:
+            metadatas = [{}] * len(texts)
+        elif len(metadatas) != len(texts):
+            # zip() would silently drop the unmatched texts.
+            raise ValueError(
+                f"metadatas length ({len(metadatas)}) != texts length ({len(texts)})"
+            )
+        embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key).embed_documents(texts)
+        vectors = []
+        for embedding, text, metadata in zip(embeddings, texts, metadatas):
+            # Copy so neither the caller's dicts nor the shared default is mutated.
+            metadata = dict(metadata)
+            if source_id:
+                metadata["source_id"] = source_id
+            vectors.append({
+                "vector": embedding,
+                "text": text,
+                "metadata": [{"key": k, "value": str(v)} for k, v in metadata.items()],
+            })
+        self.ensure_table_exists()
+        self.docsearch.add(vectors)
+
+    def search(self, query: str, k: int = 2, *args, **kwargs):
+        """Return the k nearest rows as ``(distance, text, metadata)`` tuples.
+
+        Returns an empty list when the table does not exist, so that a read
+        never creates a table as a side effect.
+        """
+        if self.table is None:
+            return []
+        query_embedding = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key).embed_query(query)
+        results = self.docsearch.search(query_embedding).limit(k).to_list()
+        return [(result["_distance"], result["text"], result["metadata"]) for result in results]
+
+    def delete_index(self):
+        """Delete the entire LanceDB index (table)."""
+        # Compare against None: truthiness of a table handle may reflect its row count.
+        if self.table is not None:
+            self.lance_db.drop_table(self.table_name)
+            self.docsearch = None  # the cached handle no longer refers to a table
+
+    def assert_embedding_dimensions(self, embeddings):
+        """Raise ValueError if ``embeddings.dimension`` differs from the table's vector size."""
+        word_embedding_dimension = embeddings.dimension
+        if self.table is None:
+            return
+        # The vector column is a fixed-size list; its size is the index dimension.
+        table_index_dimension = self.docsearch.schema.field("vector").type.list_size
+        if word_embedding_dimension != table_index_dimension:
+            raise ValueError(
+                f"Embedding dimension mismatch: embeddings.dimension ({word_embedding_dimension}) "
+                f"!= table index dimension ({table_index_dimension})"
+            )
+
+    def filter_documents(self, filter_condition: dict) -> List[dict]:
+        """Return the rows whose metadata ``source_id`` matches the filter.
+
+        Args:
+            filter_condition: Mapping that must contain ``source_id``.
+
+        Raises:
+            ValueError: If ``source_id`` is missing from ``filter_condition``.
+        """
+        if 'source_id' not in filter_condition:
+            raise ValueError("filter_condition must contain 'source_id'")
+        if self.table is None:
+            return []
+        # add_texts stringifies metadata values, so compare as strings.
+        source_id = str(filter_condition["source_id"])
+
+        def has_source_id(row):
+            return any(
+                entry and entry.get("key") == "source_id" and entry.get("value") == source_id
+                for entry in (row.get("metadata") or [])
+            )
+
+        return [row for row in self.docsearch.to_arrow().to_pylist() if has_source_id(row)]