Skip to content

Commit

Permalink
Merge branch 'main' into enhancement/conversation-loading-state
Browse files Browse the repository at this point in the history
  • Loading branch information
JeevaRamanathan authored Oct 22, 2024
2 parents 6a024b0 + 8862375 commit 2e69e9b
Show file tree
Hide file tree
Showing 25 changed files with 577 additions and 119 deletions.
4 changes: 4 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,7 @@ updates:
directory: "/frontend" # Location of package manifests
schedule:
interval: "weekly"
- package-ecosystem: "github-actions"
directory: "/"
schedule:
interval: "weekly"
14 changes: 10 additions & 4 deletions .github/holopin.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
organization: arc53
defaultSticker: clqmdf0ed34290glbvqh0kzxd
organization: docsgpt
defaultSticker: cm1ulwkkl180570cl82rtzympu
stickers:
- id: clqmdf0ed34290glbvqh0kzxd
alias: festive
- id: cm1ulwkkl180570cl82rtzympu
alias: contributor2024
- id: cm1ureg8o130450cl8c1po6mil
alias: api
- id: cm1urhmag148240cl8yvqxkthx
alias: lpc
- id: cm1urlcpq622090cl2tvu4w71y
alias: lexeu
8 changes: 4 additions & 4 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,22 @@ jobs:
contents: read
packages: write
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Set up QEMU
uses: docker/setup-qemu-action@v1

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
uses: docker/setup-buildx-action@v3

- name: Login to DockerHub
uses: docker/login-action@v2
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}

- name: Login to ghcr.io
uses: docker/login-action@v2
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/cife.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,22 @@ jobs:
contents: read
packages: write
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Set up QEMU
uses: docker/setup-qemu-action@v1

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
uses: docker/setup-buildx-action@v3

- name: Login to DockerHub
uses: docker/login-action@v2
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}

- name: Login to ghcr.io
uses: docker/login-action@v2
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/docker-develop-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,22 @@ jobs:
contents: read
packages: write
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Set up QEMU
uses: docker/setup-qemu-action@v1

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
uses: docker/setup-buildx-action@v3

- name: Login to DockerHub
uses: docker/login-action@v2
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}

- name: Login to ghcr.io
uses: docker/login-action@v2
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/docker-develop-fe-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,22 +14,22 @@ jobs:
contents: read
packages: write
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Set up QEMU
uses: docker/setup-qemu-action@v1

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
uses: docker/setup-buildx-action@v3

- name: Login to DockerHub
uses: docker/login-action@v2
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}

- name: Login to ghcr.io
uses: docker/login-action@v2
uses: docker/login-action@v3
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
ruff:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4

- name: Lint with Ruff
uses: chartboost/ruff-action@v1
6 changes: 3 additions & 3 deletions .github/workflows/pytest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,9 @@ jobs:
matrix:
python-version: ["3.11"]
steps:
- uses: actions/checkout@v3
- uses: actions/checkout@v4
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
Expand All @@ -24,7 +24,7 @@ jobs:
python -m pytest --cov=application --cov-report=xml
- name: Upload coverage reports to Codecov
if: github.event_name == 'pull_request' && matrix.python-version == '3.11'
uses: codecov/codecov-action@v3
uses: codecov/codecov-action@v4
env:
CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}

2 changes: 1 addition & 1 deletion .github/workflows/sync_fork.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
steps:
# Step 1: run a standard checkout action
- name: Checkout target repo
uses: actions/checkout@v3
uses: actions/checkout@v4

# Step 2: run the sync action
- name: Sync upstream changes
Expand Down
4 changes: 2 additions & 2 deletions application/api/user/routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -939,7 +939,7 @@ def post(self):
"conversation_id": DBRef(
"conversations", ObjectId(conversation_id)
),
"isPromptable": not is_promptable,
"isPromptable": is_promptable,
"first_n_queries": current_n_queries,
"user": user,
}
Expand All @@ -962,7 +962,7 @@ def post(self):
"$ref": "conversations",
"$id": ObjectId(conversation_id),
},
"isPromptable": not is_promptable,
"isPromptable": is_promptable,
"first_n_queries": current_n_queries,
"user": user,
}
Expand Down
5 changes: 4 additions & 1 deletion application/core/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ class Settings(BaseSettings):
DEFAULT_MAX_HISTORY: int = 150
MODEL_TOKEN_LIMITS: dict = {"gpt-3.5-turbo": 4096, "claude-2": 1e5}
UPLOAD_FOLDER: str = "inputs"
VECTOR_STORE: str = "faiss" # "faiss" or "elasticsearch" or "qdrant" or "milvus"
VECTOR_STORE: str = "faiss" # "faiss" or "elasticsearch" or "qdrant" or "milvus" or "lancedb"
RETRIEVERS_ENABLED: list = ["classic_rag", "duckduck_search"] # also brave_search

# LLM Cache
Expand Down Expand Up @@ -70,6 +70,9 @@ class Settings(BaseSettings):
MILVUS_URI: Optional[str] = "./milvus_local.db" # milvus lite version as default
MILVUS_TOKEN: Optional[str] = ""

# LanceDB vectorstore config
LANCEDB_PATH: str = "/tmp/lancedb" # Path where LanceDB stores its local data
LANCEDB_TABLE_NAME: Optional[str] = "docsgpts" # Name of the table to use for storing vectors
BRAVE_SEARCH_API_KEY: Optional[str] = None

FLASK_DEBUG_MODE: bool = False
Expand Down
119 changes: 119 additions & 0 deletions application/vectorstore/lancedb.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
from typing import List, Optional
import importlib
from application.vectorstore.base import BaseVectorStore
from application.core.settings import settings

class LanceDBVectorStore(BaseVectorStore):
"""Class for LanceDB Vector Store integration."""

def __init__(self, path: str = settings.LANCEDB_PATH,
table_name_prefix: str = settings.LANCEDB_TABLE_NAME,
source_id: str = None,
embeddings_key: str = "embeddings"):
"""Initialize the LanceDB vector store."""
super().__init__()
self.path = path
self.table_name = f"{table_name_prefix}_{source_id}" if source_id else table_name_prefix
self.embeddings_key = embeddings_key
self._lance_db = None
self.docsearch = None
self._pa = None # PyArrow (pa) will be lazy loaded

@property
def pa(self):
"""Lazy load pyarrow module."""
if self._pa is None:
self._pa = importlib.import_module("pyarrow")
return self._pa

@property
def lancedb(self):
"""Lazy load lancedb module."""
if not hasattr(self, "_lancedb_module"):
self._lancedb_module = importlib.import_module("lancedb")
return self._lancedb_module

@property
def lance_db(self):
"""Lazy load the LanceDB connection."""
if self._lance_db is None:
self._lance_db = self.lancedb.connect(self.path)
return self._lance_db

@property
def table(self):
"""Lazy load the LanceDB table."""
if self.docsearch is None:
if self.table_name in self.lance_db.table_names():
self.docsearch = self.lance_db.open_table(self.table_name)
else:
self.docsearch = None
return self.docsearch

def ensure_table_exists(self):
"""Ensure the table exists before performing operations."""
if self.table is None:
embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key)
schema = self.pa.schema([
self.pa.field("vector", self.pa.list_(self.pa.float32(), list_size=embeddings.dimension)),
self.pa.field("text", self.pa.string()),
self.pa.field("metadata", self.pa.struct([
self.pa.field("key", self.pa.string()),
self.pa.field("value", self.pa.string())
]))
])
self.docsearch = self.lance_db.create_table(self.table_name, schema=schema)

def add_texts(self, texts: List[str], metadatas: Optional[List[dict]] = None, source_id: str = None):
"""Add texts with metadata and their embeddings to the LanceDB table."""
embeddings = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key).embed_documents(texts)
vectors = []
for embedding, text, metadata in zip(embeddings, texts, metadatas or [{}] * len(texts)):
if source_id:
metadata["source_id"] = source_id
metadata_struct = [{"key": k, "value": str(v)} for k, v in metadata.items()]
vectors.append({
"vector": embedding,
"text": text,
"metadata": metadata_struct
})
self.ensure_table_exists()
self.docsearch.add(vectors)

def search(self, query: str, k: int = 2, *args, **kwargs):
"""Search LanceDB for the top k most similar vectors."""
self.ensure_table_exists()
query_embedding = self._get_embeddings(settings.EMBEDDINGS_NAME, self.embeddings_key).embed_query(query)
results = self.docsearch.search(query_embedding).limit(k).to_list()
return [(result["_distance"], result["text"], result["metadata"]) for result in results]

def delete_index(self):
"""Delete the entire LanceDB index (table)."""
if self.table:
self.lance_db.drop_table(self.table_name)

def assert_embedding_dimensions(self, embeddings):
"""Ensure that embedding dimensions match the table index dimensions."""
word_embedding_dimension = embeddings.dimension
if self.table:
table_index_dimension = len(self.docsearch.schema["vector"].type.value_type)
if word_embedding_dimension != table_index_dimension:
raise ValueError(
f"Embedding dimension mismatch: embeddings.dimension ({word_embedding_dimension}) "
f"!= table index dimension ({table_index_dimension})"
)

def filter_documents(self, filter_condition: dict) -> List[dict]:
"""Filter documents based on certain conditions."""
self.ensure_table_exists()

# Ensure source_id exists in the filter condition
if 'source_id' not in filter_condition:
raise ValueError("filter_condition must contain 'source_id'")

source_id = filter_condition["source_id"]

# Use LanceDB's native filtering if supported, otherwise filter manually
filtered_data = self.docsearch.filter(lambda x: x.metadata and x.metadata.get("source_id") == source_id).to_list()

return filtered_data
Loading

0 comments on commit 2e69e9b

Please sign in to comment.