diff --git a/ci/branch_Jenkinsfile b/ci/branch_Jenkinsfile
index ce070f8..5032e25 100644
--- a/ci/branch_Jenkinsfile
+++ b/ci/branch_Jenkinsfile
@@ -8,8 +8,8 @@ pipeline {
         PYTHONPATH = "${env.WORKSPACE}/.venv/bin"
         CUDACXX = '/usr/local/cuda-12/bin/nvcc'
         CMAKE_ARGS = "-DLLAMA_CUBLAS=on"
-        PATH="/usr/local/cuda-12.3/bin:$PATH"
-        LD_LIBRARY_PATH="/usr/local/cuda-12.3/lib64:$LD_LIBRARY_PATH"
+        PATH="/usr/local/cuda-12/bin:$PATH"
+        LD_LIBRARY_PATH="/usr/local/cuda-12/lib64:$LD_LIBRARY_PATH"
         GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=accept-new"
     }
diff --git a/ci/modify_config.py b/ci/modify_config.py
index f210f69..36f262e 100644
--- a/ci/modify_config.py
+++ b/ci/modify_config.py
@@ -14,5 +14,5 @@
 config['llm']['base_dir'] = f'{jenkins_home}/ci_test_models/models'
 config['env']['env_path'] = f'{jenkins_home}/env_file/.env'
 
-with open(f'{workspace}/src/config.ini', 'w') as configfile:
+with open(f'{workspace}/config.ini', 'w') as configfile:
     config.write(configfile)
diff --git a/src/config.ini b/config.ini
similarity index 86%
rename from src/config.ini
rename to config.ini
index b382a79..339d77c 100644
--- a/src/config.ini
+++ b/config.ini
@@ -7,14 +7,14 @@
 device_map : auto
 task : text-generation
 max_new_tokens : 1024
 temperature : 0.1
-n_batch_gpu_cpp : 1024
-n_ctx_cpp : 6000
-n_gpu_layers_cpp : -1
+n_batch : 1024
+n_ctx : 6000
+n_gpu_layers : -1 # The number of layers to put on the GPU. Mixtral-18, gemma-20
 std_out : True
 base_dir : ${root:root_path}/models
 
-[chroma]
+[chroma_client]
 host : localhost
 port : 8000
 collection_name : arxiv
@@ -22,10 +22,10 @@ collection_name : arxiv
 # embedding_model : "all-mpnet-base-v2"
 embedding_type : instructor-embedding
 embedding_model : hkunlp/instructor-xl
-store_path : ${data:data_path}/vectordb
-allow_reset : True
+;store_path : ${data:data_path}/vectordb
+;allow_reset : True
 
-[deeplake]
+[deeplake_client]
 collection_name : arxiv
 # embedding_type : sentence-transformers
 # embedding_model : "all-mpnet-base-v2"
@@ -45,7 +45,7 @@ namespace : 8c9040b0b5cd4d7cbc2e737da1b24ebf
 id_key : doc_id
 top_k : 3
 
-[parser]
+[parse_pdf]
 single_text_out : True
 strategy : hi_res
 infer_table_structure : True
diff --git a/src/docs/get_started.config.rst b/src/docs/get_started.config.rst
new file mode 100644
index 0000000..6d4c31e
--- /dev/null
+++ b/src/docs/get_started.config.rst
@@ -0,0 +1,83 @@
+Configuration
+===============
+
+GRAG gives the user the option to use a config file, in the form of a ``config.ini``.
+The use of a config file streamlines the process of passing arguments to the various components in the code.
+
+File Resolution
+****************
+GRAG uses the ``config.ini`` closest to the file you run. This lets users keep multiple config files in a project
+if they need them, or just a single config file at the root.
+
+Config Format
+***************
+``config.ini`` files use the `INI file format <https://en.wikipedia.org/wiki/INI_file>`_.
+
+Each section in the config file is named after a module, and each key-value pair supplies an argument required by the
+class in that module.
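+
+As a quick illustration, a minimal sketch of reading one section programmatically, using the ``get_config``
+helper from ``grag.components.utils`` (updated later in this diff)::
+
+    from grag.components.utils import get_config
+
+    config = get_config()          # parses the nearest config.ini
+    llm_conf = config["llm"]       # the [llm] section; values are read as strings
+    print(llm_conf["model_name"])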
+
+Example Config File
+*******************
+
+::
+
+    [llm]
+    model_name : Llama-2-13b-chat
+    quantization : Q5_K_M
+    pipeline : llama_cpp
+    device_map : auto
+    task : text-generation
+    max_new_tokens : 1024
+    temperature : 0.1
+    n_batch : 1024
+    n_ctx : 6000
+    n_gpu_layers : -1
+    std_out : True
+    base_dir : ${root:root_path}/models
+
+    [chroma_client]
+    host : localhost
+    port : 8000
+    collection_name : arxiv
+    embedding_type : instructor-embedding
+    embedding_model : hkunlp/instructor-xl
+    store_path : ${data:data_path}/vectordb
+    allow_reset : True
+
+    [deeplake_client]
+    collection_name : arxiv
+    embedding_type : instructor-embedding
+    embedding_model : hkunlp/instructor-xl
+    store_path : ${data:data_path}/vectordb
+
+    [text_splitter]
+    chunk_size : 5000
+    chunk_overlap : 400
+
+    [multivec_retriever]
+    store_path : ${data:data_path}/doc_store
+    namespace : 8c9040b0b5cd4d7cbc2e737da1b24ebf
+    id_key : doc_id
+    top_k : 3
+
+    [parse_pdf]
+    single_text_out : True
+    strategy : hi_res
+    infer_table_structure : True
+    extract_images : True
+    image_output_dir : None
+    add_captions_to_text : True
+    add_captions_to_blocks : True
+    table_as_html : True
+
+    [data]
+    data_path : ${root:root_path}/data
+
+    [env]
+    env_path : ${root:root_path}/.env
+
+    [root]
+    root_path : /home/ubuntu/Capstone_5
+
+    [quantize]
+    llama_cpp_path : ${root:root_path}
diff --git a/src/docs/get_started.rst b/src/docs/get_started.rst
index 19a2d99..d432a7d 100644
--- a/src/docs/get_started.rst
+++ b/src/docs/get_started.rst
@@ -8,4 +8,4 @@ Get Started
    get_started.parse_pdf
    get_started.llms
    get_started.vectordb
-
+   get_started.config
diff --git a/src/grag/components/llm.py b/src/grag/components/llm.py
index dd48f44..1786941 100644
--- a/src/grag/components/llm.py
+++ b/src/grag/components/llm.py
@@ -5,6 +5,7 @@
 from typing import Optional, Union
 
 import torch
+from grag.components.utils import configure_args
 from langchain.callbacks.manager import CallbackManager
 from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
 from langchain_community.llms import LlamaCpp
@@ -16,13 +17,8 @@
     pipeline,
 )
 
-from .utils import get_config
-
-llm_conf = get_config(load_env=True)["llm"]
-
-print("CUDA: ", torch.cuda.is_available())
-
+@configure_args
 class LLM:
     """A class for managing and utilizing large language models (LLMs).
@@ -38,20 +34,20 @@ class LLM:
     """
 
     def __init__(
-            self,
-            model_name: str = llm_conf["model_name"],
-            device_map: str = llm_conf["device_map"],
-            task: str = llm_conf["task"],
-            max_new_tokens: str = llm_conf["max_new_tokens"],
-            temperature: str = llm_conf["temperature"],
-            n_batch: str = llm_conf["n_batch_gpu_cpp"],
-            n_ctx: str = llm_conf["n_ctx_cpp"],
-            n_gpu_layers: str = llm_conf["n_gpu_layers_cpp"],
-            std_out: Union[bool, str] = llm_conf["std_out"],
-            base_dir: str = llm_conf["base_dir"],
-            quantization: str = llm_conf["quantization"],
-            pipeline: str = llm_conf["pipeline"],
-            callbacks=None,
+        self,
+        model_name: str,
+        device_map: str,
+        task: str,
+        max_new_tokens: str,
+        temperature: str,
+        n_batch: str,
+        n_ctx: str,
+        n_gpu_layers: str,
+        std_out: Union[bool, str],
+        base_dir: str,
+        quantization: str,
+        pipeline: str,
+        callbacks=None,
     ):
         """Initialize the LLM class using the given parameters."""
         self.base_dir = Path(base_dir)
@@ -163,8 +159,8 @@ def llama_cpp(self):
         return llm
 
     def load_model(
-            self, model_name: Optional[str] = None, pipeline: Optional[str] = None, quantization: Optional[str] = None,
-            is_local: Optional[bool] = None
+        self, model_name: Optional[str] = None, pipeline: Optional[str] = None, quantization: Optional[str] = None,
+        is_local: Optional[bool] = None
     ):
         """Loads the model based on the specified pipeline and model name.
diff --git a/src/grag/components/multivec_retriever.py b/src/grag/components/multivec_retriever.py
index a79ee26..3f85fe9 100644
--- a/src/grag/components/multivec_retriever.py
+++ b/src/grag/components/multivec_retriever.py
@@ -11,7 +11,7 @@
 
 from grag.components.parse_pdf import ParsePDF
 from grag.components.text_splitter import TextSplitter
-from grag.components.utils import get_config
+from grag.components.utils import configure_args
 from grag.components.vectordb.base import VectorDB
 from grag.components.vectordb.deeplake_client import DeepLakeClient
 from langchain.retrievers.multi_vector import MultiVectorRetriever
@@ -20,9 +20,8 @@
 from tqdm import tqdm
 from tqdm.asyncio import tqdm as atqdm
 
-multivec_retriever_conf = get_config()["multivec_retriever"]
-
+@configure_args
 class Retriever:
     """A class for multi vector retriever.
@@ -44,13 +43,13 @@
     """
 
    def __init__(
-            self,
-            vectordb: Optional[VectorDB] = None,
-            store_path: str = multivec_retriever_conf["store_path"],
-            id_key: str = multivec_retriever_conf["id_key"],
-            namespace: str = multivec_retriever_conf["namespace"],
-            top_k=int(multivec_retriever_conf["top_k"]),
-            client_kwargs: Optional[Dict[str, Any]] = None,
+        self,
+        store_path: Union[str, Path],
+        top_k: str,
+        id_key: str,
+        vectordb: Optional[VectorDB] = None,
+        namespace: Optional[str] = None,
+        client_kwargs: Optional[Dict[str, Any]] = None,
     ):
         """Initialize the Retriever.
@@ -66,6 +65,10 @@ def __init__(
         self.id_key = id_key
         self.namespace = uuid.UUID(namespace)
         if vectordb is None:
+            if any([self.store_path is None,
+                    self.id_key is None,
+                    self.namespace is None]):
+                raise TypeError("Arguments [store_path, id_key, namespace] or vectordb must be provided.")
             if client_kwargs is not None:
                 self.vectordb = DeepLakeClient(**client_kwargs)
             else:
@@ -80,7 +83,7 @@
         )
         self.docstore = self.retriever.docstore
         self.splitter = TextSplitter()
-        self.top_k: int = top_k
+        self.top_k: int = int(top_k)
         self.retriever.search_kwargs = {"k": self.top_k}
 
     def id_gen(self, doc: Document) -> str:
@@ -237,12 +240,12 @@ def get_docs_from_chunks(self, chunks: List[Document], one_to_one=False):
         return [d for d in docs if d is not None]
 
     def ingest(
-            self,
-            dir_path: Union[str, Path],
-            glob_pattern: str = "**/*.pdf",
-            dry_run: bool = False,
-            verbose: bool = True,
-            parser_kwargs: Optional[Dict[str, Any]] = None,
+        self,
+        dir_path: Union[str, Path],
+        glob_pattern: str = "**/*.pdf",
+        dry_run: bool = False,
+        verbose: bool = True,
+        parser_kwargs: Optional[Dict[str, Any]] = None,
     ):
         """Ingests the files in directory.
@@ -279,12 +282,12 @@
             print(f"DRY RUN: found - {filepath.relative_to(dir_path)}")
 
     async def aingest(
-            self,
-            dir_path: Union[str, Path],
-            glob_pattern: str = "**/*.pdf",
-            dry_run: bool = False,
-            verbose: bool = True,
-            parser_kwargs: Optional[Dict[str, Any]] = None,
+        self,
+        dir_path: Union[str, Path],
+        glob_pattern: str = "**/*.pdf",
+        dry_run: bool = False,
+        verbose: bool = True,
+        parser_kwargs: Optional[Dict[str, Any]] = None,
     ):
         """Asynchronously ingests the files in directory.
diff --git a/src/grag/components/parse_pdf.py b/src/grag/components/parse_pdf.py
index 4344566..3636098 100644
--- a/src/grag/components/parse_pdf.py
+++ b/src/grag/components/parse_pdf.py
@@ -5,14 +5,12 @@
 - ParsePDF
 """
 
+from grag.components.utils import configure_args
 from langchain_core.documents import Document
 from unstructured.partition.pdf import partition_pdf
 
-from .utils import get_config
-
-parser_conf = get_config()["parser"]
-
+@configure_args
 class ParsePDF:
     """Parsing and partitioning PDF documents into Text, Table or Image elements.
@@ -29,15 +27,15 @@
     """
 
    def __init__(
-            self,
-            single_text_out=parser_conf["single_text_out"],
-            strategy=parser_conf["strategy"],
-            infer_table_structure=parser_conf["infer_table_structure"],
-            extract_images=parser_conf["extract_images"],
-            image_output_dir=parser_conf["image_output_dir"],
-            add_captions_to_text=parser_conf["add_captions_to_text"],
-            add_captions_to_blocks=parser_conf["add_captions_to_blocks"],
-            table_as_html=parser_conf["table_as_html"],
+        self,
+        single_text_out,
+        strategy,
+        infer_table_structure,
+        extract_images,
+        image_output_dir,
+        add_captions_to_text,
+        add_captions_to_blocks,
+        table_as_html,
     ):
         """Initialize instance variables with parameters."""
         self.strategy = strategy
@@ -96,7 +94,7 @@
             if element.category == "Table":
                 if self.add_captions_to_blocks and i + 1 < len(partitions):
                     if (
-                            partitions[i + 1].category == "FigureCaption"
+                        partitions[i + 1].category == "FigureCaption"
                     ):  # check for caption
                         caption_element = partitions[i + 1]
                     else:
@@ -107,7 +105,7 @@
             elif element.category == "Image":
                 if self.add_captions_to_blocks and i + 1 < len(partitions):
                     if (
-                            partitions[i + 1].category == "FigureCaption"
+                        partitions[i + 1].category == "FigureCaption"
                     ):  # check for caption
                         caption_element = partitions[i + 1]
                     else:
@@ -195,7 +193,7 @@
             if caption_element:
                 if (
-                        self.add_caption_first
+                    self.add_caption_first
                 ):  # if there is a caption, add that before the element
                     content = "\n\n".join([str(caption_element), table_data])
                 else:
diff --git a/src/grag/components/text_splitter.py b/src/grag/components/text_splitter.py
index 48d2d04..025ec72 100644
--- a/src/grag/components/text_splitter.py
+++ b/src/grag/components/text_splitter.py
@@ -7,14 +7,11 @@
 
 from typing import Union
 
+from grag.components.utils import configure_args
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 
-from .utils import get_config
-text_splitter_conf = get_config()["text_splitter"]
-
-
-# %%
+@configure_args
 class TextSplitter:
     """Class for recursively chunking text; it prioritizes '\n\n', then '\n', and so on.
@@ -24,9 +21,9 @@
     """
 
    def __init__(
-            self,
-            chunk_size: Union[int, str] = text_splitter_conf["chunk_size"],
-            chunk_overlap: Union[int, str] = text_splitter_conf["chunk_overlap"],
+        self,
+        chunk_size: Union[int, str],
+        chunk_overlap: Union[int, str]
     ):
         """Initialize TextSplitter."""
         self.text_splitter = RecursiveCharacterTextSplitter(
diff --git a/src/grag/components/utils.py b/src/grag/components/utils.py
index 44c971d..991550c 100644
--- a/src/grag/components/utils.py
+++ b/src/grag/components/utils.py
@@ -12,7 +12,9 @@
 """
 
 import os
+from collections import defaultdict
 from configparser import ConfigParser, ExtendedInterpolation
+from functools import wraps
 from pathlib import Path
 from typing import List
@@ -32,7 +34,7 @@ def stuff_docs(docs: List[Document]) -> str:
     return "\n\n".join([doc.page_content for doc in docs])
 
 
-def find_config_path(current_path: Path) -> Path:
+def find_config_path(current_path: Path):
     """Finds the path of the 'config.ini' file by traversing up the directory tree from the current path.
 
     This function starts at the current path and moves up the directory tree until it finds a file named 'config.ini'.
@@ -42,45 +44,81 @@
         current_path (Path): The starting point for the search, typically the location of the script being executed.
 
     Returns:
-        Path: The path to the found 'config.ini' file.
-
-    Raises:
-        FileNotFoundError: If 'config.ini' cannot be found in any of the parent directories.
+        Path: The path to the found 'config.ini' file, or None if it cannot be found.
     """
     config_path = Path("config.ini")
     while not (current_path / config_path).exists():
         current_path = current_path.parent
         if current_path == current_path.parent:
-            raise FileNotFoundError(f"config.ini not found in {config_path}.")
+            # raise FileNotFoundError(f"config.ini not found in {config_path}.")
+            return None
     return current_path / config_path
 
 
-def get_config(load_env=False) -> ConfigParser:
+def get_config(load_env=False):
     """Retrieves and parses the configuration settings from the 'config.ini' file.
 
     This function locates the 'config.ini' file by calling `find_config_path` using the script's current location.
     It initializes a `ConfigParser` object to read the configuration settings from the located 'config.ini' file.
+    Optionally, it can also load environment variables from a `.env` file specified in the config.
+
+    Args:
+        load_env (bool): If True, load environment variables from the path specified in the 'config.ini'. Defaults to False.
 
     Returns:
-        ConfigParser: A parser object containing the configuration settings from 'config.ini'.
+        ConfigParser: A parser object containing the configuration settings from 'config.ini', or a nested
+            defaultdict that returns None for every key if no 'config.ini' can be found.
     """
-    # Assuming this script is somewhere inside your project directory
-    script_location = Path(__file__).resolve()
     config_path_ = os.environ.get("CONFIG_PATH")
     if config_path_:
        config_path = Path(config_path_)
     else:
+        script_location = Path('.').resolve()
         config_path = find_config_path(script_location)
-    os.environ["CONFIG_PATH"] = str(config_path)
+        if config_path is not None:
+            os.environ["CONFIG_PATH"] = str(config_path)
 
     # Initialize parser and read config
-    config = ConfigParser(interpolation=ExtendedInterpolation())
-    config.read(config_path)
-    print(f"Loaded config from {config_path}.")
-    # load_dotenv(config['env']['env_path'])
-    if load_env:
-        env_path = Path(config['env']['env_path'])
-        if env_path.exists():
-            load_dotenv(env_path)
-            print(f"Loaded environment variables from {env_path}")
-    return config
+    if config_path:
+        config = ConfigParser(interpolation=ExtendedInterpolation())
+        config.read(config_path)
+        print(f"Loaded config from {config_path}.")
+        # Load .env
+        if load_env:
+            env_path = Path(config['env']['env_path'])
+            if env_path.exists():
+                load_dotenv(env_path)
+                print(f"Loaded environment variables from {env_path}")
+        return config
+    else:
+        return defaultdict(lambda: defaultdict(lambda: None))
+
+
+def configure_args(cls):
+    """Decorator to configure class instantiation arguments from a 'config.ini' file, based on the class's module name.
+
+    This function reads the configuration specific to a class's module from 'config.ini', then uses it to provide
+    defaults for keyword arguments omitted at class instantiation; explicitly passed keyword arguments take
+    precedence over values from the config.
+
+    Args:
+        cls (class): The class whose instantiation is to be configured.
+
+    Returns:
+        function: A wrapped class constructor that uses modified arguments based on the configuration.
+
+    Raises:
+        TypeError: If there is a mismatch in provided arguments and class constructor requirements.
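+
+    Example:
+        A minimal sketch: ``TextSplitter`` (decorated with ``@configure_args`` elsewhere in this diff)
+        pulls any keyword arguments not passed explicitly from the ``[text_splitter]`` section of
+        'config.ini'::
+
+            from grag.components.text_splitter import TextSplitter
+
+            splitter = TextSplitter()                  # chunk_size, chunk_overlap come from config
+            splitter = TextSplitter(chunk_size=1000)   # explicit kwargs take precedence over config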
+ """ + module_namespace = cls.__module__.split('.')[-1] + + config = get_config()[module_namespace] + + @wraps(cls) + def wrapper(*args, **kwargs): + new_kwargs = {**config, **kwargs} + try: + return cls(*args, **new_kwargs) + except TypeError as e: + raise TypeError(f"{e}, or create a config.ini file. ") from e + + return wrapper diff --git a/src/grag/components/vectordb/chroma_client.py b/src/grag/components/vectordb/chroma_client.py index 247bac9..7047171 100644 --- a/src/grag/components/vectordb/chroma_client.py +++ b/src/grag/components/vectordb/chroma_client.py @@ -9,16 +9,15 @@ import chromadb from grag.components.embedding import Embedding -from grag.components.utils import get_config +from grag.components.utils import configure_args from grag.components.vectordb.base import VectorDB from langchain_community.vectorstores import Chroma from langchain_core.documents import Document from tqdm import tqdm from tqdm.asyncio import tqdm as atqdm -chroma_conf = get_config()["chroma"] - +@configure_args class ChromaClient(VectorDB): """A class for connecting to a hosted Chroma Vectorstore collection. @@ -44,12 +43,12 @@ class ChromaClient(VectorDB): """ def __init__( - self, - host: str = chroma_conf["host"], - port: str = chroma_conf["port"], - collection_name: str = chroma_conf["collection_name"], - embedding_type: str = chroma_conf["embedding_type"], - embedding_model: str = chroma_conf["embedding_model"], + self, + host: str, + port: str, + collection_name: str, + embedding_type: str, + embedding_model: str, ): """Initialize a ChromaClient object. @@ -126,7 +125,7 @@ def add_docs(self, docs: List[Document], verbose=True) -> None: """ docs = self._filter_metadata(docs) for doc in ( - tqdm(docs, desc=f"Adding to {self.collection_name}:") if verbose else docs + tqdm(docs, desc=f"Adding to {self.collection_name}:") if verbose else docs ): _id = self.langchain_client.add_documents([doc]) @@ -143,9 +142,9 @@ async def aadd_docs(self, docs: List[Document], verbose=True) -> None: docs = self._filter_metadata(docs) if verbose: for doc in atqdm( - docs, - desc=f"Adding documents to {self.collection_name}", - total=len(docs), + docs, + desc=f"Adding documents to {self.collection_name}", + total=len(docs), ): await self.langchain_client.aadd_documents([doc]) else: @@ -153,7 +152,7 @@ async def aadd_docs(self, docs: List[Document], verbose=True) -> None: await self.langchain_client.aadd_documents([doc]) def get_chunk( - self, query: str, with_score: bool = False, top_k: Optional[int] = None + self, query: str, with_score: bool = False, top_k: Optional[int] = None ) -> Union[List[Document], List[Tuple[Document, float]]]: """Returns the most similar chunks from the chroma database. @@ -176,7 +175,7 @@ def get_chunk( ) async def aget_chunk( - self, query: str, with_score=False, top_k=None + self, query: str, with_score=False, top_k=None ) -> Union[List[Document], List[Tuple[Document, float]]]: """Returns the most (cosine) similar chunks from the vector database, asynchronously. 
diff --git a/src/grag/components/vectordb/deeplake_client.py b/src/grag/components/vectordb/deeplake_client.py
index e0d2df2..5c9cbf9 100644
--- a/src/grag/components/vectordb/deeplake_client.py
+++ b/src/grag/components/vectordb/deeplake_client.py
@@ -9,16 +9,15 @@
 from typing import List, Optional, Tuple, Union
 
 from grag.components.embedding import Embedding
-from grag.components.utils import get_config
+from grag.components.utils import configure_args
 from grag.components.vectordb.base import VectorDB
 from langchain_community.vectorstores import DeepLake
 from langchain_core.documents import Document
 from tqdm import tqdm
 from tqdm.asyncio import tqdm as atqdm
 
-deeplake_conf = get_config()["deeplake"]
-
+@configure_args
 class DeepLakeClient(VectorDB):
     """A class for connecting to a DeepLake Vectorstore.
@@ -40,12 +39,12 @@
     """
 
    def __init__(
-            self,
-            collection_name: str = deeplake_conf["collection_name"],
-            store_path: Union[str, Path] = deeplake_conf["store_path"],
-            embedding_type: str = deeplake_conf["embedding_type"],
-            embedding_model: str = deeplake_conf["embedding_model"],
-            read_only: bool = False,
+        self,
+        collection_name: str,
+        store_path: Union[str, Path],
+        embedding_type: str,
+        embedding_model: str,
+        read_only: bool = False,
     ):
         """Initialize DeepLake client object."""
         self.store_path = Path(store_path)
@@ -87,7 +86,7 @@ def add_docs(self, docs: List[Document], verbose=True) -> None:
         """
         docs = self._filter_metadata(docs)
         for doc in (
-                tqdm(docs, desc=f"Adding to {self.collection_name}:") if verbose else docs
+            tqdm(docs, desc=f"Adding to {self.collection_name}:") if verbose else docs
         ):
             _id = self.langchain_client.add_documents([doc])
@@ -104,9 +103,9 @@ async def aadd_docs(self, docs: List[Document], verbose=True) -> None:
         docs = self._filter_metadata(docs)
         if verbose:
             for doc in atqdm(
-                    docs,
-                    desc=f"Adding documents to {self.collection_name}",
-                    total=len(docs),
+                docs,
+                desc=f"Adding documents to {self.collection_name}",
+                total=len(docs),
             ):
                 await self.langchain_client.aadd_documents([doc])
         else:
@@ -114,7 +113,7 @@
                 await self.langchain_client.aadd_documents([doc])
 
     def get_chunk(
-            self, query: str, with_score: bool = False, top_k: Optional[int] = None
+        self, query: str, with_score: bool = False, top_k: Optional[int] = None
     ) -> Union[List[Document], List[Tuple[Document, float]]]:
         """Returns the most similar chunks from the deeplake database.
@@ -137,7 +136,7 @@
     async def aget_chunk(
-            self, query: str, with_score=False, top_k=None
+        self, query: str, with_score=False, top_k=None
     ) -> Union[List[Document], List[Tuple[Document, float]]]:
         """Returns the most similar chunks from the deeplake database, asynchronously.