From ae40f93937eb53e9f16041c86bccfe0ce60e1d67 Mon Sep 17 00:00:00 2001 From: Arjun Bingly Date: Sun, 24 Mar 2024 17:52:21 -0400 Subject: [PATCH 1/3] MyPy ignore missing imports --- pyproject.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index f7c2d4f..3185030 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -116,3 +116,6 @@ docstring-code-format = true [tool.ruff.lint.pydocstyle] convention = "google" + +[tool.mypy] +ignore_missing_imports = true From 546d163e13417ffc8dd30dd280bb7fa4287d59f3 Mon Sep 17 00:00:00 2001 From: Arjun Bingly Date: Tue, 26 Mar 2024 19:01:33 -0400 Subject: [PATCH 2/3] Top_k bug multivec retriever --- src/grag/components/multivec_retriever.py | 38 +++++++++++------------ 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/grag/components/multivec_retriever.py b/src/grag/components/multivec_retriever.py index 05478df..b7c8c2f 100644 --- a/src/grag/components/multivec_retriever.py +++ b/src/grag/components/multivec_retriever.py @@ -44,13 +44,13 @@ class Retriever: """ def __init__( - self, - vectordb: Optional[VectorDB] = None, - store_path: str = multivec_retriever_conf["store_path"], - id_key: str = multivec_retriever_conf["id_key"], - namespace: str = multivec_retriever_conf["namespace"], - top_k=1, - client_kwargs: Optional[Dict[str, Any]] = None, + self, + vectordb: Optional[VectorDB] = None, + store_path: str = multivec_retriever_conf["store_path"], + id_key: str = multivec_retriever_conf["id_key"], + namespace: str = multivec_retriever_conf["namespace"], + top_k=multivec_retriever_conf["top_k"], + client_kwargs: Optional[Dict[str, Any]] = None, ): """Initialize the Retriever. @@ -236,12 +236,12 @@ def get_docs_from_chunks(self, chunks: List[Document], one_to_one=False): return [d for d in docs if d is not None] def ingest( - self, - dir_path: Union[str, Path], - glob_pattern: str = "**/*.pdf", - dry_run: bool = False, - verbose: bool = True, - parser_kwargs: dict = None, + self, + dir_path: Union[str, Path], + glob_pattern: str = "**/*.pdf", + dry_run: bool = False, + verbose: bool = True, + parser_kwargs: dict = None, ): """Ingests the files in directory. @@ -278,12 +278,12 @@ def ingest( print(f"DRY RUN: found - {filepath.relative_to(dir_path)}") async def aingest( - self, - dir_path: Union[str, Path], - glob_pattern: str = "**/*.pdf", - dry_run: bool = False, - verbose: bool = True, - parser_kwargs: dict = None, + self, + dir_path: Union[str, Path], + glob_pattern: str = "**/*.pdf", + dry_run: bool = False, + verbose: bool = True, + parser_kwargs: dict = None, ): """Asynchronously ingests the files in directory. From f433944cdcb509348cdb8834d2edd80810f18a4b Mon Sep 17 00:00:00 2001 From: sanchitvj Date: Tue, 26 Mar 2024 20:52:04 -0400 Subject: [PATCH 3/3] coverage checking --- projects/Basic-RAG/BasicRAG_stuff.py | 1 + src/config.ini | 2 +- src/grag/components/multivec_retriever.py | 38 +++++++++++------------ src/tests/rag/basic_rag_test.py | 16 ++++++---- 4 files changed, 31 insertions(+), 26 deletions(-) diff --git a/projects/Basic-RAG/BasicRAG_stuff.py b/projects/Basic-RAG/BasicRAG_stuff.py index da95ec6..554c305 100644 --- a/projects/Basic-RAG/BasicRAG_stuff.py +++ b/projects/Basic-RAG/BasicRAG_stuff.py @@ -6,6 +6,7 @@ client = DeepLakeClient(collection_name="test") retriever = Retriever(vectordb=client) + rag = BasicRAG(doc_chain="stuff", retriever=retriever) if __name__ == "__main__": diff --git a/src/config.ini b/src/config.ini index e23c1b5..d55e002 100644 --- a/src/config.ini +++ b/src/config.ini @@ -1,5 +1,5 @@ [llm] -model_name : Llama-2-7b-chat +model_name : Llama-2-13b-chat # meta-llama/Llama-2-70b-chat-hf Mixtral-8x7B-Instruct-v0.1 quantization : Q5_K_M pipeline : llama_cpp diff --git a/src/grag/components/multivec_retriever.py b/src/grag/components/multivec_retriever.py index b7c8c2f..5a396fa 100644 --- a/src/grag/components/multivec_retriever.py +++ b/src/grag/components/multivec_retriever.py @@ -44,13 +44,13 @@ class Retriever: """ def __init__( - self, - vectordb: Optional[VectorDB] = None, - store_path: str = multivec_retriever_conf["store_path"], - id_key: str = multivec_retriever_conf["id_key"], - namespace: str = multivec_retriever_conf["namespace"], - top_k=multivec_retriever_conf["top_k"], - client_kwargs: Optional[Dict[str, Any]] = None, + self, + vectordb: Optional[VectorDB] = None, + store_path: str = multivec_retriever_conf["store_path"], + id_key: str = multivec_retriever_conf["id_key"], + namespace: str = multivec_retriever_conf["namespace"], + top_k=int(multivec_retriever_conf["top_k"]), + client_kwargs: Optional[Dict[str, Any]] = None, ): """Initialize the Retriever. @@ -236,12 +236,12 @@ def get_docs_from_chunks(self, chunks: List[Document], one_to_one=False): return [d for d in docs if d is not None] def ingest( - self, - dir_path: Union[str, Path], - glob_pattern: str = "**/*.pdf", - dry_run: bool = False, - verbose: bool = True, - parser_kwargs: dict = None, + self, + dir_path: Union[str, Path], + glob_pattern: str = "**/*.pdf", + dry_run: bool = False, + verbose: bool = True, + parser_kwargs: dict = None, ): """Ingests the files in directory. @@ -278,12 +278,12 @@ def ingest( print(f"DRY RUN: found - {filepath.relative_to(dir_path)}") async def aingest( - self, - dir_path: Union[str, Path], - glob_pattern: str = "**/*.pdf", - dry_run: bool = False, - verbose: bool = True, - parser_kwargs: dict = None, + self, + dir_path: Union[str, Path], + glob_pattern: str = "**/*.pdf", + dry_run: bool = False, + verbose: bool = True, + parser_kwargs: dict = None, ): """Asynchronously ingests the files in directory. diff --git a/src/tests/rag/basic_rag_test.py b/src/tests/rag/basic_rag_test.py index 2249028..b8c2ceb 100644 --- a/src/tests/rag/basic_rag_test.py +++ b/src/tests/rag/basic_rag_test.py @@ -1,11 +1,16 @@ -from typing import Text, List +from typing import List, Text +from grag.components.multivec_retriever import Retriever +from grag.components.vectordb.deeplake_client import DeepLakeClient from grag.rag.basic_rag import BasicRAG +client = DeepLakeClient(collection_name="test") +retriever = Retriever(vectordb=client) + def test_rag_stuff(): - rag = BasicRAG(doc_chain="stuff") - response, sources = rag("What is simulated annealing?") + rag = BasicRAG(doc_chain="stuff", retriever=retriever) + response, sources = rag("What is Flash Attention?") assert isinstance(response, Text) assert isinstance(sources, List) assert all(isinstance(s, str) for s in sources) @@ -13,9 +18,8 @@ def test_rag_stuff(): def test_rag_refine(): - rag = BasicRAG(doc_chain="refine") - response, sources = rag("What is simulated annealing?") - # assert isinstance(response, Text) + rag = BasicRAG(doc_chain="refine", retriever=retriever) + response, sources = rag("What is Flash Attention?") assert isinstance(response, List) assert all(isinstance(s, str) for s in response) assert isinstance(sources, List)