diff --git a/simple-rag/.env.example b/simple-rag/.env.example
new file mode 100644
index 0000000..36d272d
--- /dev/null
+++ b/simple-rag/.env.example
@@ -0,0 +1 @@
+OPENAI_API_KEY=""
\ No newline at end of file
diff --git a/simple-rag/README.md b/simple-rag/README.md
new file mode 100644
index 0000000..041eeec
--- /dev/null
+++ b/simple-rag/README.md
@@ -0,0 +1,91 @@
+# simple-rag
+
+A minimal implementation of Retrieval-Augmented Generation (RAG) in Python.
+
+## Features
+
+- Document retrieval using a Chroma vector database (persisted to a local directory)
+- OpenAI embeddings for document indexing
+- OpenAI models for answer generation
+- Simple and extensible codebase
+
+## Installation
+
+```bash
+git clone <repository-url>
+cd simple-rag
+pip install -r requirements.txt
+
+cp .env.example .env  # then add your OpenAI API key to .env
+
+python rag.py
+```
+
+## Example
+
+```bash
+python rag.py
+
+Enter the path to the PDF file: principles_2nd_edition_updated.pdf
+INFO:__main__:Loaded 132 pages from principles_2nd_edition_updated.pdf...
+----------------------------------------------------------------------------------------------------
+
+INFO:chroma:Initializing Chroma vector store...
+INFO:openai_service:Creating text-embedding-3-small...
+INFO:chromadb.telemetry.product.posthog:Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.
+INFO:chroma:Storing 132 documents in the vector store...
+INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
+----------------------------------------------------------------------------------------------------
+
+Enter your query: Can you get me a brief summary please?
+INFO:chroma:Retrieving documents similar to the query: Can you get me a brief summary please?
+INFO:chroma:Creating retriever with MMR search...
+INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
+----------------------------------------------------------------------------------------------------
+
+INFO:__main__:Organized retrieval results...
+----------------------------------------------------------------------------------------------------
+
+INFO:openai_service:Generating answer with gpt-4o-mini...
+INFO:httpx:HTTP Request: POST https://api.openai.com/v1/responses "HTTP/1.1 200 OK"
+----------------------------------------------------------------------------------------------------
+
+
+Answer:
+
+The text discusses key concepts in building AI agents, particularly focusing on memory systems and tracing for debugging. It highlights hierarchical memory, which combines recent interactions with long-term memories to formulate responses, showcasing how this works in a practical example. The text also touches on tracing, a method for monitoring functions in applications to visualize input and output, emphasizing the importance of standardization through OpenTelemetry. The author, Sam Bhagwat, emphasizes the relevance of these principles for developing effective AI applications, particularly in the context of the rapid advancements in large language models.
+```
+
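+## Using the modules directly
+
+The same pipeline can be driven from your own code. A minimal sketch (the
+file name and query below are placeholders; it assumes `OPENAI_API_KEY` is
+set in `.env`):
+
+```python
+import asyncio
+from pathlib import Path
+
+from chroma import get_chroma_store, store_documents, retrieve_similar_documents
+from openai_service import get_answer_from_openai
+from rag import load_pdf, organize_retrieval_results
+
+
+async def main() -> None:
+    # Index the PDF, retrieve context for a query, then generate an answer.
+    docs = load_pdf(Path("example.pdf"))
+    store = get_chroma_store()
+    await store_documents(store, docs)
+
+    query = "Give me a brief summary of the document."
+    results = await retrieve_similar_documents(store, query)
+    context = organize_retrieval_results(results)
+    print(get_answer_from_openai(query, context))
+
+
+asyncio.run(main())
+```
+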
+## Contributing
+
+Contributions are welcome! Please open issues or submit pull requests.
diff --git a/simple-rag/__init__.py b/simple-rag/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/simple-rag/chroma.py b/simple-rag/chroma.py
new file mode 100644
index 0000000..314f2e3
--- /dev/null
+++ b/simple-rag/chroma.py
@@ -0,0 +1,51 @@
+from uuid import uuid4
+import logging
+
+from langchain_chroma import Chroma
+from langchain_core.documents.base import Document
+
+from openai_service import get_openai_embeddings
+from settings import settings
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def get_chroma_store() -> Chroma:
+    logger.info("Initializing Chroma vector store...")
+    return Chroma(
+        collection_name=settings.CHROMA_COLLECTION_NAME,
+        embedding_function=get_openai_embeddings(),
+        persist_directory=settings.CHROMA_PERSIST_DIRECTORY,
+    )
+
+
+async def store_documents(vector_store: Chroma, documents: list[Document]) -> None:
+    logger.info(f"Storing {len(documents)} documents in the vector store...")
+    uuids = [str(uuid4()) for _ in range(len(documents))]
+    await vector_store.aadd_documents(documents=documents, ids=uuids)
+
+
+def get_retriever(
+    vector_store: Chroma,
+    k: int = 5,
+    fetch_k: int = 20,
+):
+    logger.info("Creating retriever with MMR search...")
+    # MMR fetches the fetch_k nearest candidates, then re-ranks them for
+    # diversity and returns the top k; it does not support a score threshold.
+    return vector_store.as_retriever(
+        search_type="mmr",
+        search_kwargs={"k": k, "fetch_k": fetch_k},
+    )
+
+
+async def retrieve_similar_documents(
+    vector_store: Chroma,
+    query: str,
+    k: int = 5,
+    fetch_k: int = 20,
+) -> list[Document]:
+    logger.info(f"Retrieving documents similar to the query: {query}")
+    retriever = get_retriever(vector_store, k, fetch_k)
+    return await retriever.ainvoke(query)
diff --git a/simple-rag/openai_service.py b/simple-rag/openai_service.py
new file mode 100644
index 0000000..15029c9
--- /dev/null
+++ b/simple-rag/openai_service.py
@@ -0,0 +1,56 @@
+import logging
+
+from langchain_openai import OpenAIEmbeddings
+from openai import OpenAI
+
+from settings import settings
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+client = OpenAI(api_key=settings.OPENAI_API_KEY)
+
+
+def get_openai_embeddings() -> OpenAIEmbeddings:
+    logger.info(f"Creating {settings.EMBEDDING_MODEL}...")
+    return OpenAIEmbeddings(
+        model=settings.EMBEDDING_MODEL, api_key=settings.OPENAI_API_KEY
+    )
+
+
+def get_answer_from_openai(
+    query: str,
+    information: str,
+    temperature: float = 0.5,
+) -> str:
+    logger.info(f"Generating answer with {settings.COMPLETION_MODEL}...")
+
+    user_content = [
+        {
+            "type": "input_text",
+            "text": "Use the given relevant information to answer the query.",
+        },
+        {"type": "input_text", "text": f"\nQuery: {query}"},
+        {"type": "input_text", "text": f"\nRelevant Information: {information}"},
+    ]
+
+    messages = [{"role": "user", "content": user_content}]
+
+    response = client.responses.create(
+        model=settings.COMPLETION_MODEL,
+        input=messages,
+        temperature=temperature,
+    )
+
+    return response.output_text
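+
+
+if __name__ == "__main__":
+    # Illustrative smoke test with placeholder strings: generates an answer
+    # from inline context (assumes OPENAI_API_KEY is set in .env).
+    print(
+        get_answer_from_openai(
+            query="What colour is the sky?",
+            information="The sky is blue on a clear day.",
+        )
+    )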
diff --git a/simple-rag/rag.py b/simple-rag/rag.py
new file mode 100644
index 0000000..ae31081
--- /dev/null
+++ b/simple-rag/rag.py
@@ -0,0 +1,94 @@
+import asyncio
+import logging
+from pathlib import Path
+
+import pymupdf
+from langchain_core.documents.base import Document
+
+from chroma import (
+    get_chroma_store,
+    store_documents,
+    retrieve_similar_documents,
+)
+from openai_service import get_answer_from_openai
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def load_pdf(file_path: Path) -> list[Document]:
+    """Load a PDF file and return its content as a list of Document objects."""
+    documents = []
+
+    filename = file_path.name
+    with pymupdf.open(file_path) as pdf:
+        total_pages = len(pdf)
+
+        for page_num in range(total_pages):
+            page = pdf.load_page(page_num)
+            text = page.get_text()
+
+            # Skip pages with no extractable text (e.g. scanned images).
+            if not text.strip():
+                continue
+
+            metadata = {
+                "source": filename,
+                "page_number": page_num + 1,
+                "total_pages": total_pages,
+            }
+
+            documents.append(Document(page_content=text, metadata=metadata))
+    logger.info(f"Loaded {len(documents)} pages from {filename}...")
+    return documents
+
+
+def organize_retrieval_results(results: list[Document]) -> str:
+    """Deduplicate retrieved chunks and format them into one context string."""
+    seen = set()
+    chunks = []
+    for doc in results:
+        page_content = doc.page_content.replace("\n", " ").strip()
+        page_number = doc.metadata.get("page_number", "N/A")
+        total_pages = doc.metadata.get("total_pages", "N/A")
+        chunk = f"Page {page_number} of {total_pages}\n\n{page_content}"
+
+        # The set gives a fast membership check; the list preserves the
+        # original retrieval order.
+        if chunk not in seen:
+            seen.add(chunk)
+            chunks.append(chunk)
+
+    retrieved_information = (
+        "The relevant information for the given query:\n\n" + "\n\n".join(chunks)
+    )
+
+    logger.info("Organized retrieval results...")
+    return retrieved_information
+
+
+if __name__ == "__main__":
+    filepath = Path(input("Enter the path to the PDF file: ").strip())
+
+    try:
+        docs = load_pdf(filepath)
+    except pymupdf.FileNotFoundError:
+        logger.error(f"File not found: {filepath}")
+        exit(1)
+    print(f"{'--' * 50}\n")
+
+    vector_store = get_chroma_store()
+    asyncio.run(store_documents(vector_store, docs))
+    print(f"{'--' * 50}\n")
+
+    query = input("Enter your query: ")
+    results = asyncio.run(retrieve_similar_documents(vector_store, query))
+    print(f"{'--' * 50}\n")
+
+    organized_results = organize_retrieval_results(results)
+    print(f"{'--' * 50}\n")
+
+    answer = get_answer_from_openai(query, organized_results)
+    print(f"{'--' * 50}\n")
+    print("\nAnswer:\n")
+    print(answer)
diff --git a/simple-rag/requirements.txt b/simple-rag/requirements.txt
new file mode 100644
index 0000000..ef14bf2
--- /dev/null
+++ b/simple-rag/requirements.txt
@@ -0,0 +1,8 @@
+langchain-chroma
+langchain-openai
+python-dotenv
+pydantic-settings
+langchain-core
+langchain
+PyMuPDF
+openai
\ No newline at end of file
diff --git a/simple-rag/settings.py b/simple-rag/settings.py
new file mode 100644
index 0000000..a0e7381
--- /dev/null
+++ b/simple-rag/settings.py
@@ -0,0 +1,22 @@
+from pydantic_settings import BaseSettings, SettingsConfigDict
+
+
+class Settings(BaseSettings):
+    model_config = SettingsConfigDict(
+        env_file=".env",
+        case_sensitive=True,
+        extra="ignore",
+    )
+
+    OPENAI_API_KEY: str = ""
+    EMBEDDING_MODEL: str = "text-embedding-3-small"
+    COMPLETION_MODEL: str = "gpt-4o-mini"
+    CHROMA_PERSIST_DIRECTORY: str = "chroma_db"
+    CHROMA_COLLECTION_NAME: str = "main"
+
+
+settings = Settings()
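+
+# Usage sketch: import the shared instance rather than constructing new ones:
+#     from settings import settings
+#     settings.COMPLETION_MODEL  # -> "gpt-4o-mini"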