Merge pull request #103 from arjbingly/config-parsing

Config Wrapper
arjbingly · Apr 26, 2024 · 2e44f24 · 2e44f24
2 parents 2b8f8e5 + 145d35a
commit 2e44f24
Show file tree

Hide file tree

Showing 12 changed files with 245 additions and 132 deletions.
diff --git a/ci/branch_Jenkinsfile b/ci/branch_Jenkinsfile
@@ -8,8 +8,8 @@ pipeline {
         PYTHONPATH = "${env.WORKSPACE}/.venv/bin"
         CUDACXX = '/usr/local/cuda-12/bin/nvcc'
         CMAKE_ARGS = "-DLLAMA_CUBLAS=on"
-        PATH="/usr/local/cuda-12.3/bin:$PATH"
-        LD_LIBRARY_PATH="/usr/local/cuda-12.3/lib64:$LD_LIBRARY_PATH"
+        PATH="/usr/local/cuda-12/bin:$PATH"
+        LD_LIBRARY_PATH="/usr/local/cuda-12/lib64:$LD_LIBRARY_PATH"
         GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=accept-new"
         }
 

diff --git a/ci/modify_config.py b/ci/modify_config.py
@@ -14,5 +14,5 @@
 config['llm']['base_dir'] = f'{jenkins_home}/ci_test_models/models'
 config['env']['env_path'] = f'{jenkins_home}/env_file/.env'
 
-with open(f'{workspace}/src/config.ini', 'w') as configfile:
+with open(f'{workspace}/config.ini', 'w') as configfile:
     config.write(configfile)
diff --git a/src/config.ini → config.ini b/src/config.ini → config.ini
@@ -7,25 +7,25 @@ device_map : auto
 task : text-generation
 max_new_tokens : 1024
 temperature : 0.1
-n_batch_gpu_cpp : 1024
-n_ctx_cpp : 6000
-n_gpu_layers_cpp : -1
+n_batch : 1024
+n_ctx : 6000
+n_gpu_layers : -1
 # The number of layers to put on the GPU. Mixtral-18, gemma-20
 std_out : True
 base_dir : ${root:root_path}/models
 
-[chroma]
+[chroma_client]
 host : localhost
 port : 8000
 collection_name : arxiv
 # embedding_type : sentence-transformers
 # embedding_model : "all-mpnet-base-v2"
 embedding_type : instructor-embedding
 embedding_model : hkunlp/instructor-xl
-store_path : ${data:data_path}/vectordb
-allow_reset : True
+;store_path : ${data:data_path}/vectordb
+;allow_reset : True
 
-[deeplake]
+[deeplake_client]
 collection_name : arxiv
 # embedding_type : sentence-transformers
 # embedding_model : "all-mpnet-base-v2"
@@ -45,7 +45,7 @@ namespace : 8c9040b0b5cd4d7cbc2e737da1b24ebf
 id_key : doc_id
 top_k : 3
 
-[parser]
+[parse_pdf]
 single_text_out : True
 strategy : hi_res
 infer_table_structure : True

diff --git a/src/docs/get_started.config.rst b/src/docs/get_started.config.rst
@@ -0,0 +1,83 @@
+Configuration
+===============
+
+GRAG gives the user an option to use a config file, in the form of a ``config.ini``.
+The use of a config file streamlines the process of passing arguments to the various components in the code.
+
+File Resolution
+****************
+GRAG uses the ``config.ini`` closest to the file you run. This lets users keep multiple config files per project
+if they need them, or just a single config file at the root.
+
+Config Format
+***************
+``config.ini`` files use the `INI file format <https://en.wikipedia.org/wiki/INI_file>`_.
+
+Each section in the config file is named after a module, and each key-value pair in that section is an argument
+required by the class in that module.
+
+Example Config File
+*******************
+
+::
+
+    [llm]
+    model_name : Llama-2-13b-chat
+    quantization : Q5_K_M
+    pipeline : llama_cpp
+    device_map : auto
+    task : text-generation
+    max_new_tokens : 1024
+    temperature : 0.1
+    n_batch : 1024
+    n_ctx : 6000
+    n_gpu_layers : -1
+    std_out : True
+    base_dir : ${root:root_path}/models
+
+    [chroma_client]
+    host : localhost
+    port : 8000
+    collection_name : arxiv
+    embedding_type : instructor-embedding
+    embedding_model : hkunlp/instructor-xl
+    store_path : ${data:data_path}/vectordb
+    allow_reset : True
+
+    [deeplake_client]
+    collection_name : arxiv
+    embedding_type : instructor-embedding
+    embedding_model : hkunlp/instructor-xl
+    store_path : ${data:data_path}/vectordb
+
+    [text_splitter]
+    chunk_size : 5000
+    chunk_overlap : 400
+
+    [multivec_retriever]
+    store_path : ${data:data_path}/doc_store
+    namespace : 8c9040b0b5cd4d7cbc2e737da1b24ebf
+    id_key : doc_id
+    top_k : 3
+
+    [parse_pdf]
+    single_text_out : True
+    strategy : hi_res
+    infer_table_structure : True
+    extract_images : True
+    image_output_dir : None
+    add_captions_to_text : True
+    add_captions_to_blocks : True
+    table_as_html : True
+
+    [data]
+    data_path : ${root:root_path}/data
+
+    [env]
+    env_path : ${root:root_path}/.env
+
+    [root]
+    root_path : /home/ubuntu/Capstone_5
+
+    [quantize]
+    llama_cpp_path : ${root:root_path}
diff --git a/src/docs/get_started.rst b/src/docs/get_started.rst
@@ -8,4 +8,4 @@ Get Started
    get_started.parse_pdf
    get_started.llms
    get_started.vectordb
-
+   get_started.config
diff --git a/src/grag/components/llm.py b/src/grag/components/llm.py
@@ -5,6 +5,7 @@
 from typing import Optional, Union
 
 import torch
+from grag.components.utils import configure_args
 from langchain.callbacks.manager import CallbackManager
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from langchain_community.llms import LlamaCpp
@@ -16,13 +17,8 @@
     pipeline,
 )
 
-from .utils import get_config
-
-llm_conf = get_config(load_env=True)["llm"]
-
-print("CUDA: ", torch.cuda.is_available())
-
 
+@configure_args
 class LLM:
     """A class for managing and utilizing large language models (LLMs).
 
@@ -38,20 +34,20 @@ class LLM:
     """
 
     def __init__(
-        self,
-        model_name: str = llm_conf["model_name"],
-        device_map: str = llm_conf["device_map"],
-        task: str = llm_conf["task"],
-        max_new_tokens: str = llm_conf["max_new_tokens"],
-        temperature: str = llm_conf["temperature"],
-        n_batch: str = llm_conf["n_batch_gpu_cpp"],
-        n_ctx: str = llm_conf["n_ctx_cpp"],
-        n_gpu_layers: str = llm_conf["n_gpu_layers_cpp"],
-        std_out: Union[bool, str] = llm_conf["std_out"],
-        base_dir: str = llm_conf["base_dir"],
-        quantization: str = llm_conf["quantization"],
-        pipeline: str = llm_conf["pipeline"],
-        callbacks=None,
+            self,
+            model_name: str,
+            device_map: str,
+            task: str,
+            max_new_tokens: str,
+            temperature: str,
+            n_batch: str,
+            n_ctx: str,
+            n_gpu_layers: str,
+            std_out: Union[bool, str],
+            base_dir: str,
+            quantization: str,
+            pipeline: str,
+            callbacks=None,
     ):
         """Initialize the LLM class using the given parameters."""
         self.base_dir = Path(base_dir)
@@ -163,8 +159,8 @@ def llama_cpp(self):
         return llm
 
     def load_model(
-        self, model_name: Optional[str] = None, pipeline: Optional[str] = None, quantization: Optional[str] = None,
-        is_local: Optional[bool] = None
+            self, model_name: Optional[str] = None, pipeline: Optional[str] = None, quantization: Optional[str] = None,
+            is_local: Optional[bool] = None
     ):
         """Loads the model based on the specified pipeline and model name.
 

diff --git a/src/grag/components/multivec_retriever.py b/src/grag/components/multivec_retriever.py
@@ -11,7 +11,7 @@
 
 from grag.components.parse_pdf import ParsePDF
 from grag.components.text_splitter import TextSplitter
-from grag.components.utils import get_config
+from grag.components.utils import configure_args
 from grag.components.vectordb.base import VectorDB
 from grag.components.vectordb.deeplake_client import DeepLakeClient
 from langchain.retrievers.multi_vector import MultiVectorRetriever
@@ -20,9 +20,8 @@
 from tqdm import tqdm
 from tqdm.asyncio import tqdm as atqdm
 
-multivec_retriever_conf = get_config()["multivec_retriever"]
-
 
+@configure_args
 class Retriever:
     """A class for multi vector retriever.
 
@@ -44,13 +43,13 @@ class Retriever:
     """
 
     def __init__(
-            self,
-            vectordb: Optional[VectorDB] = None,
-            store_path: str = multivec_retriever_conf["store_path"],
-            id_key: str = multivec_retriever_conf["id_key"],
-            namespace: str = multivec_retriever_conf["namespace"],
-            top_k=int(multivec_retriever_conf["top_k"]),
-            client_kwargs: Optional[Dict[str, Any]] = None,
+        self,
+        store_path: Union[str, Path],
+        top_k: str,
+        id_key: str,
+        vectordb: Optional[VectorDB] = None,
+        namespace: Optional[str] = None,
+        client_kwargs: Optional[Dict[str, Any]] = None,
     ):
         """Initialize the Retriever.
 
@@ -66,6 +65,10 @@ def __init__(
         self.id_key = id_key
         self.namespace = uuid.UUID(namespace)
         if vectordb is None:
+            if any([self.store_path is None,
+                    self.id_key is None,
+                    self.namespace is None]):
+                raise TypeError("Arguments [store_path, id_key, namespace] or vectordb must be provided.")
             if client_kwargs is not None:
                 self.vectordb = DeepLakeClient(**client_kwargs)
             else:
@@ -80,7 +83,7 @@ def __init__(
         )
         self.docstore = self.retriever.docstore
         self.splitter = TextSplitter()
-        self.top_k: int = top_k
+        self.top_k: int = int(top_k)
         self.retriever.search_kwargs = {"k": self.top_k}
 
     def id_gen(self, doc: Document) -> str:
@@ -237,12 +240,12 @@ def get_docs_from_chunks(self, chunks: List[Document], one_to_one=False):
                 return [d for d in docs if d is not None]
 
     def ingest(
-            self,
-            dir_path: Union[str, Path],
-            glob_pattern: str = "**/*.pdf",
-            dry_run: bool = False,
-            verbose: bool = True,
-            parser_kwargs: Optional[Dict[str, Any]] = None,
+        self,
+        dir_path: Union[str, Path],
+        glob_pattern: str = "**/*.pdf",
+        dry_run: bool = False,
+        verbose: bool = True,
+        parser_kwargs: Optional[Dict[str, Any]] = None,
     ):
         """Ingests the files in directory.
 
@@ -279,12 +282,12 @@ def ingest(
                     print(f"DRY RUN: found - {filepath.relative_to(dir_path)}")
 
     async def aingest(
-            self,
-            dir_path: Union[str, Path],
-            glob_pattern: str = "**/*.pdf",
-            dry_run: bool = False,
-            verbose: bool = True,
-            parser_kwargs: Optional[Dict[str, Any]] = None,
+        self,
+        dir_path: Union[str, Path],
+        glob_pattern: str = "**/*.pdf",
+        dry_run: bool = False,
+        verbose: bool = True,
+        parser_kwargs: Optional[Dict[str, Any]] = None,
     ):
         """Asynchronously ingests the files in directory.
 

diff --git a/src/grag/components/parse_pdf.py b/src/grag/components/parse_pdf.py
@@ -5,14 +5,12 @@
 - ParsePDF
 """
 
+from grag.components.utils import configure_args
 from langchain_core.documents import Document
 from unstructured.partition.pdf import partition_pdf
 
-from .utils import get_config
-
-parser_conf = get_config()["parser"]
-
 
+@configure_args
 class ParsePDF:
     """Parsing and partitioning PDF documents into Text, Table or Image elements.
 
@@ -29,15 +27,15 @@ class ParsePDF:
     """
 
     def __init__(
-        self,
-        single_text_out=parser_conf["single_text_out"],
-        strategy=parser_conf["strategy"],
-        infer_table_structure=parser_conf["infer_table_structure"],
-        extract_images=parser_conf["extract_images"],
-        image_output_dir=parser_conf["image_output_dir"],
-        add_captions_to_text=parser_conf["add_captions_to_text"],
-        add_captions_to_blocks=parser_conf["add_captions_to_blocks"],
-        table_as_html=parser_conf["table_as_html"],
+            self,
+            single_text_out,
+            strategy,
+            infer_table_structure,
+            extract_images,
+            image_output_dir,
+            add_captions_to_text,
+            add_captions_to_blocks,
+            table_as_html,
     ):
         """Initialize instance variables with parameters."""
         self.strategy = strategy
@@ -96,7 +94,7 @@ def classify(self, partitions):
             if element.category == "Table":
                 if self.add_captions_to_blocks and i + 1 < len(partitions):
                     if (
-                        partitions[i + 1].category == "FigureCaption"
+                            partitions[i + 1].category == "FigureCaption"
                     ):  # check for caption
                         caption_element = partitions[i + 1]
                     else:
@@ -107,7 +105,7 @@ def classify(self, partitions):
             elif element.category == "Image":
                 if self.add_captions_to_blocks and i + 1 < len(partitions):
                     if (
-                        partitions[i + 1].category == "FigureCaption"
+                            partitions[i + 1].category == "FigureCaption"
                     ):  # check for caption
                         caption_element = partitions[i + 1]
                     else:
@@ -195,7 +193,7 @@ def process_tables(self, elements):
 
             if caption_element:
                 if (
-                    self.add_caption_first
+                        self.add_caption_first
                 ):  # if there is a caption, add that before the element
                     content = "\n\n".join([str(caption_element), table_data])
                 else: