Skip to content

Commit

Permalink
Merge pull request #103 from arjbingly/config-parsing
Browse files Browse the repository at this point in the history
Config Wrapper
  • Loading branch information
arjbingly authored Apr 26, 2024
2 parents 2b8f8e5 + 145d35a commit 2e44f24
Show file tree
Hide file tree
Showing 12 changed files with 245 additions and 132 deletions.
4 changes: 2 additions & 2 deletions ci/branch_Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,8 @@ pipeline {
PYTHONPATH = "${env.WORKSPACE}/.venv/bin"
CUDACXX = '/usr/local/cuda-12/bin/nvcc'
CMAKE_ARGS = "-DLLAMA_CUBLAS=on"
PATH="/usr/local/cuda-12.3/bin:$PATH"
LD_LIBRARY_PATH="/usr/local/cuda-12.3/lib64:$LD_LIBRARY_PATH"
PATH="/usr/local/cuda-12/bin:$PATH"
LD_LIBRARY_PATH="/usr/local/cuda-12/lib64:$LD_LIBRARY_PATH"
GIT_SSH_COMMAND="ssh -o StrictHostKeyChecking=accept-new"
}

Expand Down
2 changes: 1 addition & 1 deletion ci/modify_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,5 @@
config['llm']['base_dir'] = f'{jenkins_home}/ci_test_models/models'
config['env']['env_path'] = f'{jenkins_home}/env_file/.env'

with open(f'{workspace}/src/config.ini', 'w') as configfile:
with open(f'{workspace}/config.ini', 'w') as configfile:
config.write(configfile)
16 changes: 8 additions & 8 deletions src/config.ini → config.ini
Original file line number Diff line number Diff line change
Expand Up @@ -7,25 +7,25 @@ device_map : auto
task : text-generation
max_new_tokens : 1024
temperature : 0.1
n_batch_gpu_cpp : 1024
n_ctx_cpp : 6000
n_gpu_layers_cpp : -1
n_batch : 1024
n_ctx : 6000
n_gpu_layers : -1
# The number of layers to put on the GPU. Mixtral-18, gemma-20
std_out : True
base_dir : ${root:root_path}/models

[chroma]
[chroma_client]
host : localhost
port : 8000
collection_name : arxiv
# embedding_type : sentence-transformers
# embedding_model : "all-mpnet-base-v2"
embedding_type : instructor-embedding
embedding_model : hkunlp/instructor-xl
store_path : ${data:data_path}/vectordb
allow_reset : True
;store_path : ${data:data_path}/vectordb
;allow_reset : True

[deeplake]
[deeplake_client]
collection_name : arxiv
# embedding_type : sentence-transformers
# embedding_model : "all-mpnet-base-v2"
Expand All @@ -45,7 +45,7 @@ namespace : 8c9040b0b5cd4d7cbc2e737da1b24ebf
id_key : doc_id
top_k : 3

[parser]
[parse_pdf]
single_text_out : True
strategy : hi_res
infer_table_structure : True
Expand Down
83 changes: 83 additions & 0 deletions src/docs/get_started.config.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
Configuration
===============

GRAG gives the user the option of using a config file named ``config.ini``.
The use of a config file streamlines the process of passing arguments to the various components in the code.

File Resolution
****************
GRAG uses the ``config.ini`` closest to the file you run. This lets users keep multiple config files per project
if they need them, or just a single config file at the root.

Config Format
***************
``config.ini`` files use the `INI file format <https://en.wikipedia.org/wiki/INI_file>`_.

Each section in the config file is named after a module, and each key-value pair in that section is an argument
required by the class in that module.

Example Config File
*******************

::

[llm]
model_name : Llama-2-13b-chat
quantization : Q5_K_M
pipeline : llama_cpp
device_map : auto
task : text-generation
max_new_tokens : 1024
temperature : 0.1
n_batch : 1024
n_ctx : 6000
n_gpu_layers : -1
std_out : True
base_dir : ${root:root_path}/models

[chroma_client]
host : localhost
port : 8000
collection_name : arxiv
embedding_type : instructor-embedding
embedding_model : hkunlp/instructor-xl
store_path : ${data:data_path}/vectordb
allow_reset : True

[deeplake_client]
collection_name : arxiv
embedding_type : instructor-embedding
embedding_model : hkunlp/instructor-xl
store_path : ${data:data_path}/vectordb

[text_splitter]
chunk_size : 5000
chunk_overlap : 400

[multivec_retriever]
store_path : ${data:data_path}/doc_store
namespace : 8c9040b0b5cd4d7cbc2e737da1b24ebf
id_key : doc_id
top_k : 3

[parse_pdf]
single_text_out : True
strategy : hi_res
infer_table_structure : True
extract_images : True
image_output_dir : None
add_captions_to_text : True
add_captions_to_blocks : True
table_as_html : True

[data]
data_path : ${root:root_path}/data

[env]
env_path : ${root:root_path}/.env

[root]
root_path : /home/ubuntu/Capstone_5

[quantize]
llama_cpp_path : ${root:root_path}
2 changes: 1 addition & 1 deletion src/docs/get_started.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ Get Started
get_started.parse_pdf
get_started.llms
get_started.vectordb

get_started.config
40 changes: 18 additions & 22 deletions src/grag/components/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from typing import Optional, Union

import torch
from grag.components.utils import configure_args
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp
Expand All @@ -16,13 +17,8 @@
pipeline,
)

from .utils import get_config

llm_conf = get_config(load_env=True)["llm"]

print("CUDA: ", torch.cuda.is_available())


@configure_args
class LLM:
"""A class for managing and utilizing large language models (LLMs).
Expand All @@ -38,20 +34,20 @@ class LLM:
"""

def __init__(
self,
model_name: str = llm_conf["model_name"],
device_map: str = llm_conf["device_map"],
task: str = llm_conf["task"],
max_new_tokens: str = llm_conf["max_new_tokens"],
temperature: str = llm_conf["temperature"],
n_batch: str = llm_conf["n_batch_gpu_cpp"],
n_ctx: str = llm_conf["n_ctx_cpp"],
n_gpu_layers: str = llm_conf["n_gpu_layers_cpp"],
std_out: Union[bool, str] = llm_conf["std_out"],
base_dir: str = llm_conf["base_dir"],
quantization: str = llm_conf["quantization"],
pipeline: str = llm_conf["pipeline"],
callbacks=None,
self,
model_name: str,
device_map: str,
task: str,
max_new_tokens: str,
temperature: str,
n_batch: str,
n_ctx: str,
n_gpu_layers: str,
std_out: Union[bool, str],
base_dir: str,
quantization: str,
pipeline: str,
callbacks=None,
):
"""Initialize the LLM class using the given parameters."""
self.base_dir = Path(base_dir)
Expand Down Expand Up @@ -163,8 +159,8 @@ def llama_cpp(self):
return llm

def load_model(
self, model_name: Optional[str] = None, pipeline: Optional[str] = None, quantization: Optional[str] = None,
is_local: Optional[bool] = None
self, model_name: Optional[str] = None, pipeline: Optional[str] = None, quantization: Optional[str] = None,
is_local: Optional[bool] = None
):
"""Loads the model based on the specified pipeline and model name.
Expand Down
49 changes: 26 additions & 23 deletions src/grag/components/multivec_retriever.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

from grag.components.parse_pdf import ParsePDF
from grag.components.text_splitter import TextSplitter
from grag.components.utils import get_config
from grag.components.utils import configure_args
from grag.components.vectordb.base import VectorDB
from grag.components.vectordb.deeplake_client import DeepLakeClient
from langchain.retrievers.multi_vector import MultiVectorRetriever
Expand All @@ -20,9 +20,8 @@
from tqdm import tqdm
from tqdm.asyncio import tqdm as atqdm

multivec_retriever_conf = get_config()["multivec_retriever"]


@configure_args
class Retriever:
"""A class for multi vector retriever.
Expand All @@ -44,13 +43,13 @@ class Retriever:
"""

def __init__(
self,
vectordb: Optional[VectorDB] = None,
store_path: str = multivec_retriever_conf["store_path"],
id_key: str = multivec_retriever_conf["id_key"],
namespace: str = multivec_retriever_conf["namespace"],
top_k=int(multivec_retriever_conf["top_k"]),
client_kwargs: Optional[Dict[str, Any]] = None,
self,
store_path: Union[str, Path],
top_k: str,
id_key: str,
vectordb: Optional[VectorDB] = None,
namespace: Optional[str] = None,
client_kwargs: Optional[Dict[str, Any]] = None,
):
"""Initialize the Retriever.
Expand All @@ -66,6 +65,10 @@ def __init__(
self.id_key = id_key
self.namespace = uuid.UUID(namespace)
if vectordb is None:
if any([self.store_path is None,
self.id_key is None,
self.namespace is None]):
raise TypeError("Arguments [store_path, id_key, namespace] or vectordb must be provided.")
if client_kwargs is not None:
self.vectordb = DeepLakeClient(**client_kwargs)
else:
Expand All @@ -80,7 +83,7 @@ def __init__(
)
self.docstore = self.retriever.docstore
self.splitter = TextSplitter()
self.top_k: int = top_k
self.top_k: int = int(top_k)
self.retriever.search_kwargs = {"k": self.top_k}

def id_gen(self, doc: Document) -> str:
Expand Down Expand Up @@ -237,12 +240,12 @@ def get_docs_from_chunks(self, chunks: List[Document], one_to_one=False):
return [d for d in docs if d is not None]

def ingest(
self,
dir_path: Union[str, Path],
glob_pattern: str = "**/*.pdf",
dry_run: bool = False,
verbose: bool = True,
parser_kwargs: Optional[Dict[str, Any]] = None,
self,
dir_path: Union[str, Path],
glob_pattern: str = "**/*.pdf",
dry_run: bool = False,
verbose: bool = True,
parser_kwargs: Optional[Dict[str, Any]] = None,
):
"""Ingests the files in directory.
Expand Down Expand Up @@ -279,12 +282,12 @@ def ingest(
print(f"DRY RUN: found - {filepath.relative_to(dir_path)}")

async def aingest(
self,
dir_path: Union[str, Path],
glob_pattern: str = "**/*.pdf",
dry_run: bool = False,
verbose: bool = True,
parser_kwargs: Optional[Dict[str, Any]] = None,
self,
dir_path: Union[str, Path],
glob_pattern: str = "**/*.pdf",
dry_run: bool = False,
verbose: bool = True,
parser_kwargs: Optional[Dict[str, Any]] = None,
):
"""Asynchronously ingests the files in directory.
Expand Down
30 changes: 14 additions & 16 deletions src/grag/components/parse_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,12 @@
- ParsePDF
"""

from grag.components.utils import configure_args
from langchain_core.documents import Document
from unstructured.partition.pdf import partition_pdf

from .utils import get_config

parser_conf = get_config()["parser"]


@configure_args
class ParsePDF:
"""Parsing and partitioning PDF documents into Text, Table or Image elements.
Expand All @@ -29,15 +27,15 @@ class ParsePDF:
"""

def __init__(
self,
single_text_out=parser_conf["single_text_out"],
strategy=parser_conf["strategy"],
infer_table_structure=parser_conf["infer_table_structure"],
extract_images=parser_conf["extract_images"],
image_output_dir=parser_conf["image_output_dir"],
add_captions_to_text=parser_conf["add_captions_to_text"],
add_captions_to_blocks=parser_conf["add_captions_to_blocks"],
table_as_html=parser_conf["table_as_html"],
self,
single_text_out,
strategy,
infer_table_structure,
extract_images,
image_output_dir,
add_captions_to_text,
add_captions_to_blocks,
table_as_html,
):
"""Initialize instance variables with parameters."""
self.strategy = strategy
Expand Down Expand Up @@ -96,7 +94,7 @@ def classify(self, partitions):
if element.category == "Table":
if self.add_captions_to_blocks and i + 1 < len(partitions):
if (
partitions[i + 1].category == "FigureCaption"
partitions[i + 1].category == "FigureCaption"
): # check for caption
caption_element = partitions[i + 1]
else:
Expand All @@ -107,7 +105,7 @@ def classify(self, partitions):
elif element.category == "Image":
if self.add_captions_to_blocks and i + 1 < len(partitions):
if (
partitions[i + 1].category == "FigureCaption"
partitions[i + 1].category == "FigureCaption"
): # check for caption
caption_element = partitions[i + 1]
else:
Expand Down Expand Up @@ -195,7 +193,7 @@ def process_tables(self, elements):

if caption_element:
if (
self.add_caption_first
self.add_caption_first
): # if there is a caption, add that before the element
content = "\n\n".join([str(caption_element), table_data])
else:
Expand Down
Loading

0 comments on commit 2e44f24

Please sign in to comment.