From 55650db3672d414059565bc3aa3b775badd2bd3f Mon Sep 17 00:00:00 2001 From: Rajendra Kadam Date: Tue, 19 Mar 2024 23:09:40 +0530 Subject: [PATCH 1/2] Use remote qdrant vectordb, Separate data load from rag app init --- .../pebblo_identity_rag-pinecone.py | 94 ------------ .../pebblo_identity_rag-qdrant.py | 137 +++++++++++++----- .../identity-rag/pinecone_data_loader.py | 59 -------- 3 files changed, 97 insertions(+), 193 deletions(-) delete mode 100644 pebblo_saferetriever/langchain/identity-rag/pebblo_identity_rag-pinecone.py delete mode 100644 pebblo_saferetriever/langchain/identity-rag/pinecone_data_loader.py diff --git a/pebblo_saferetriever/langchain/identity-rag/pebblo_identity_rag-pinecone.py b/pebblo_saferetriever/langchain/identity-rag/pebblo_identity_rag-pinecone.py deleted file mode 100644 index 6ad4415c..00000000 --- a/pebblo_saferetriever/langchain/identity-rag/pebblo_identity_rag-pinecone.py +++ /dev/null @@ -1,94 +0,0 @@ -import os - -# Fill-in OPENAI_API_KEY in .env file -# in this directory before proceeding -from dotenv import load_dotenv -from langchain.chains import PebbloRetrievalQA -from langchain_community.document_loaders import ( - GoogleDriveLoader, - UnstructuredFileIOLoader, -) -from langchain_community.document_loaders.pebblo import PebbloSafeLoader -from langchain_openai.embeddings import OpenAIEmbeddings -from langchain_openai.llms import OpenAI - -from google_auth import get_authorized_identities -from pinecone_data_loader import create_pinecone_index - -load_dotenv() - - -class PebbloIdentityRAG: - def __init__(self, folder_id: str, index_name: str, pinecone_api_key: str): - self.app_name = "pebblo-identity-rag-1" - self.index_name = index_name - self.pinecone_api_key = pinecone_api_key - - # Load documents - print("Loading RAG documents ...") - self.loader = PebbloSafeLoader( - GoogleDriveLoader( - folder_id=folder_id, - token_path="./google_token.json", - recursive=True, - file_loader_cls=UnstructuredFileIOLoader, - file_loader_kwargs={"mode": "elements"}, - load_auth=True, - ), - name=self.app_name, # App name (Mandatory) - owner="Joe Smith", # Owner (Optional) - description="Identity enabled SafeLoader and SafeRetrival app using Pebblo", # Description (Optional) - ) - self.documents = self.loader.load() - - print(self.documents[-1].metadata.get("authorized_identities")) - print(f"Loaded {len(self.documents)} documents ...\n") - - # Load documents into VectorDB - print("Hydrating Vector DB ...") - self.vectordb = self.embeddings() - print("Finished hydrating Vector DB ...\n") - - # Prepare LLM - self.llm = OpenAI() - - def embeddings(self): - """ - Create embeddings from documents - """ - embeddings = OpenAIEmbeddings() - vectordb = create_pinecone_index( - self.pinecone_api_key, self.index_name, embeddings, self.documents - ) - return vectordb - - def ask(self, question: str, auth: dict): - # Prepare retriever QA chain - retriever = PebbloRetrievalQA.from_chain_type( - llm=self.llm, - chain_type="stuff", - retriever=self.vectordb.as_retriever(), - verbose=True, - auth_context=auth, - ) - return retriever.invoke(question) - - -if __name__ == "__main__": - # TODO: pass the actual GoogleDrive folder id - # folder_id = "1sd0RqMMJKidf9Pb4YRCI2-NH4Udj885k" - - folder_id = "1sRvP0j6L6M_Ll0y_8Qp7cFWUOlpdbfN5" - index_name = "identity-enabled-rag" - pinecone_api_key = os.environ.get("PINECONE_API_KEY") - rag_app = PebbloIdentityRAG(folder_id, index_name, pinecone_api_key) - - prompt = "What criteria are used to evaluate employee performance during performance reviews?" - print(f"Query:\n{prompt}") - - user_1 = "user@clouddefense.io" - auth = { - "authorized_identities": get_authorized_identities(user_1), - } - response = rag_app.ask(prompt, auth) - print(f"Response:\n{response}") diff --git a/pebblo_saferetriever/langchain/identity-rag/pebblo_identity_rag-qdrant.py b/pebblo_saferetriever/langchain/identity-rag/pebblo_identity_rag-qdrant.py index c9f803fd..4d55f4b3 100644 --- a/pebblo_saferetriever/langchain/identity-rag/pebblo_identity_rag-qdrant.py +++ b/pebblo_saferetriever/langchain/identity-rag/pebblo_identity_rag-qdrant.py @@ -1,32 +1,63 @@ -from typing import List +import os -# Fill-in OPENAI_API_KEY in .env file -# in this directory before proceeding from dotenv import load_dotenv from langchain.chains import PebbloRetrievalQA -from langchain.schema import Document from langchain_community.document_loaders import ( GoogleDriveLoader, UnstructuredFileIOLoader, ) from langchain_community.document_loaders.pebblo import PebbloSafeLoader from langchain_community.vectorstores.qdrant import Qdrant -from langchain_openai.embeddings import OpenAIEmbeddings +from langchain_openai import OpenAIEmbeddings from langchain_openai.llms import OpenAI +from qdrant_client import QdrantClient from google_auth import get_authorized_identities load_dotenv() +# Get Qdrant API key from environment +QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY", "your-qdrant-api-key") +# Qdrant URL +QDRANT_URL = os.environ.get("QDRANT_URL") +# Qdrant DB collection name +DEFAULT_COLLECTION_NAME = "identity-enabled-rag" + +print( + f"QDRANT Config: \n\tKey: {QDRANT_API_KEY[:10]}{'x' * len(QDRANT_API_KEY[10:])}, \n\tUrl: {QDRANT_URL}\n" +) + class PebbloIdentityRAG: - def __init__(self, folder_id: str, collection_name: str): - self.app_name = "pebblo-identity-rag-1" - self.collection_name = collection_name + def __init__(self, collection_name: str = DEFAULT_COLLECTION_NAME): + self.app_name = "acme-corp-rag-1" + self.qdrant_collection_name = collection_name + self.llm = OpenAI() + self.embeddings = OpenAIEmbeddings() + self.vectordb = self.init_vector_db() + + def init_vector_db(self): + """ + Load Vector DB + """ + client = QdrantClient( + url=QDRANT_URL, + api_key=QDRANT_API_KEY, + ) + vectordb = Qdrant( + client=client, + collection_name=self.qdrant_collection_name, + embeddings=self.embeddings, + ) + return vectordb + + def load_documents_to_vector_db(self, folder_id: str): + """ + Load documents to Qdrant Vector DB + """ + print("Loading documents...") - # Load documents - print("Loading RAG documents ...") - self.loader = PebbloSafeLoader( + loader = PebbloSafeLoader( GoogleDriveLoader( folder_id=folder_id, token_path="./google_token.json", @@ -39,53 +70,79 @@ def __init__(self, folder_id: str, collection_name: str): owner="Joe Smith", # Owner (Optional) description="Identity enabled SafeLoader and SafeRetrival app using Pebblo", # Description (Optional) ) - self.documents = self.loader.load() + documents = loader.load() + print(f"Loaded {len(documents)} documents ...\n") + print(f"First document: {documents[0]}") - print(self.documents[-1].metadata.get("authorized_identities")) - print(f"Loaded {len(self.documents)} documents ...\n") # Load documents into VectorDB print("Hydrating Vector DB ...") - self.vectordb = self.embeddings(self.documents) - print("Finished hydrating Vector DB ...\n") - - # Prepare LLM - self.llm = OpenAI() - def embeddings(self, docs: List[Document]): - embeddings = OpenAIEmbeddings() - vectordb = Qdrant.from_documents( - docs, - embeddings, - location=":memory:", - collection_name=self.collection_name, + _vectordb = Qdrant.from_documents( + documents, + self.embeddings, + url=QDRANT_URL, + api_key=QDRANT_API_KEY, + collection_name=self.qdrant_collection_name, ) - return vectordb + print("Finished hydrating Vector DB ...\n") - def ask(self, question: str, auth: dict): + def ask(self, question: str, auth_context: dict): # Prepare retriever QA chain retriever = PebbloRetrievalQA.from_chain_type( llm=self.llm, chain_type="stuff", retriever=self.vectordb.as_retriever(), verbose=True, - auth_context=auth, + auth_context=auth_context, ) return retriever.invoke(question) if __name__ == "__main__": # TODO: pass the actual GoogleDrive folder id - folder_id = "1sRvP0j6L6M_Ll0y_8Qp7cFWUOlpdbfN5" - collection_name = "identity-enabled-rag" - rag_app = PebbloIdentityRAG(folder_id, collection_name) + def_folder_id = "1sRvP0j6L6M_Ll0y_8Qp7cFWUOlpdbfN5" + def_service_acc_path = "credentials/service-account.json" + def_ingestion_user_email_address = "user@clouddefense.io" + input_collection_name = "identity-enabled-rag" + + rag_app = PebbloIdentityRAG(input_collection_name) + + print("Please enter ingestion user details for loading data...") + ingestion_user_email_address = ( + input(f"email address ({def_ingestion_user_email_address}) : ") + or def_ingestion_user_email_address + ) + ingestion_user_service_account_path = ( + input(f"service-account.json path ({def_service_acc_path}) : ") + or def_service_acc_path + ) + input_folder_id = input(f"Folder id ({def_folder_id}): ") or def_folder_id + + # Load documents to Qdrant Vector DB + rag_app.load_documents_to_vector_db(input_folder_id) + + while True: + print("Please enter end user details below") + end_user_email_address = input("User email address : ") + prompt = input("Please provide the prompt : ") + print(f"User: {end_user_email_address}.\nQuery:{prompt}\n") - prompt = "What criteria are used to evaluate employee performance during performance reviews?" - print(f"Query:\n{prompt}") + auth = { + "authorized_identities": get_authorized_identities( + admin_user_email_address=ingestion_user_email_address, + credentials_file_path=ingestion_user_service_account_path, + user_email=end_user_email_address, + ) + } + response = rag_app.ask(prompt, auth) + print(f"Response:\n{response}") + try: + continue_or_exist = int(input("\n\nType 1 to continue and 0 to exit : ")) + except ValueError: + print("Please provide valid input") + continue - user_1 = "user@clouddefense.io" - auth = { - "authorized_identities": get_authorized_identities(user_1), - } + if not continue_or_exist: + exit(0) - response = rag_app.ask(prompt, auth) - print(f"Response:\n{response}") + print("\n\n") diff --git a/pebblo_saferetriever/langchain/identity-rag/pinecone_data_loader.py b/pebblo_saferetriever/langchain/identity-rag/pinecone_data_loader.py deleted file mode 100644 index 989d4c7a..00000000 --- a/pebblo_saferetriever/langchain/identity-rag/pinecone_data_loader.py +++ /dev/null @@ -1,59 +0,0 @@ -import time - -from langchain_community.vectorstores import Pinecone as PineconeVectorStore -from pinecone import Pinecone, PodSpec, ServerlessSpec - - -def create_pinecone_index( - pinecone_api_key, index_name, embeddings, documents -) -> PineconeVectorStore: - """ - Create a Pinecone index and load documents into it - """ - # configure client - pc = Pinecone(api_key=pinecone_api_key) - - use_serverless = True - if use_serverless: - spec = ServerlessSpec(cloud="aws", region="us-west-2") - else: - environment = "gcp-starter" - # if not using a starter index, you should specify a pod_type too - spec = PodSpec(environment=environment) - - # check for and delete index if already exists - if index_name in pc.list_indexes().names(): - print(f"Index {index_name} already exists. ") - # Delete and create a new index - pc.delete_index(index_name) - print(f"Deleted index {index_name}.") - - print(f"Creating index {index_name}...") - # create a new index - pc.create_index( - index_name, - dimension=1536, # dimensionality of text-embedding-ada-002 - metric="dotproduct", - spec=spec, - ) - - # wait for index to be initialized - while not pc.describe_index(index_name).status["ready"]: - time.sleep(1) - - index = pc.Index(index_name) - index.describe_index_stats() - - print("Creating embeddings and Loading docs into index...") - texts = [t.page_content for t in documents] - metadatas = [t.metadata for t in documents] - - vector_store = PineconeVectorStore.from_texts( - texts, embeddings, metadatas=metadatas, index_name=index_name - ) - # wait for index to be initialized - print("Waiting for index to be ready...") - time.sleep(15) - - print("Done!") - return vector_store From 4fd1a3de703fc7c1c225183b291a70628d79c635 Mon Sep 17 00:00:00 2001 From: Rajendra Kadam Date: Wed, 20 Mar 2024 13:28:46 +0530 Subject: [PATCH 2/2] Separate data loading and retrieval --- .../identity-rag/pebblo_identity_safeload.py | 78 +++++++++++++++++++ .../pebblo_identity_rag-qdrant.py | 68 +++------------- 2 files changed, 87 insertions(+), 59 deletions(-) create mode 100644 pebblo_safeloader/langchain/identity-rag/pebblo_identity_safeload.py diff --git a/pebblo_safeloader/langchain/identity-rag/pebblo_identity_safeload.py b/pebblo_safeloader/langchain/identity-rag/pebblo_identity_safeload.py new file mode 100644 index 00000000..2ac6b189 --- /dev/null +++ b/pebblo_safeloader/langchain/identity-rag/pebblo_identity_safeload.py @@ -0,0 +1,78 @@ +# data_loader.py +from dotenv import load_dotenv + +from typing import List + +from langchain.schema import Document +from langchain_community.document_loaders import ( + GoogleDriveLoader, + UnstructuredFileIOLoader, +) +from langchain_community.document_loaders.pebblo import PebbloSafeLoader +from langchain_community.vectorstores.qdrant import Qdrant +from langchain_openai.embeddings import OpenAIEmbeddings + +load_dotenv() + +# Qdrant DB path +QDRANT_PATH = "qdrant.db" +# Qdrant DB collection name +COLLECTION_NAME = "identity-enabled-rag" + + +class QdrantDataLoader: + def __init__(self, folder_id: str, collection_name: str = COLLECTION_NAME): + self.app_name = "acme-corp-rag-1" + self.folder_id = folder_id + self.qdrant_collection_name = collection_name + + def load_documents(self): + print("\nLoading RAG documents ...") + loader = PebbloSafeLoader( + GoogleDriveLoader( + folder_id=self.folder_id, + token_path="./google_token.json", + recursive=True, + file_loader_cls=UnstructuredFileIOLoader, + file_loader_kwargs={"mode": "elements"}, + load_auth=True, + ), + name=self.app_name, # App name (Mandatory) + owner="Joe Smith", # Owner (Optional) + description="Identity enabled SafeLoader and SafeRetrival app using Pebblo", # Description (Optional) + ) + documents = loader.load() + for doc in documents: + print(f"{doc.metadata}") + + # print(documents[-1].metadata.get("authorized_identities")) + print(f"Loaded {len(documents)} documents ...\n") + return documents + + def add_docs_to_qdrant(self, documents: List[Document]): + """ + Load documents into Qdrant + """ + print("\nAdding documents to Qdrant ...") + embeddings = OpenAIEmbeddings() + vectordb = Qdrant.from_documents( + documents, + embeddings, + path=QDRANT_PATH, + collection_name=self.qdrant_collection_name, + ) + print(f"Added {len(documents)} documents to Qdrant ...\n") + return vectordb + + +if __name__ == "__main__": + print("Loading documents to Qdrant ...") + # def_folder_id = "1FQ-LrarHhWBJRGHc8yiH2ZtirpUXERYP" + def_folder_id = "15CyFIWOPJOR5BxDID7G6tUisfHU1szrg" + collection_name = "identity-enabled-rag" + + qloader = QdrantDataLoader(def_folder_id, collection_name) + + documents = qloader.load_documents() + + vectordb = qloader.add_docs_to_qdrant(documents) diff --git a/pebblo_saferetriever/langchain/identity-rag/pebblo_identity_rag-qdrant.py b/pebblo_saferetriever/langchain/identity-rag/pebblo_identity_rag-qdrant.py index 4d55f4b3..14a36304 100644 --- a/pebblo_saferetriever/langchain/identity-rag/pebblo_identity_rag-qdrant.py +++ b/pebblo_saferetriever/langchain/identity-rag/pebblo_identity_rag-qdrant.py @@ -1,12 +1,5 @@ -import os - from dotenv import load_dotenv from langchain.chains import PebbloRetrievalQA -from langchain_community.document_loaders import ( - GoogleDriveLoader, - UnstructuredFileIOLoader, -) -from langchain_community.document_loaders.pebblo import PebbloSafeLoader from langchain_community.vectorstores.qdrant import Qdrant from langchain_openai import OpenAIEmbeddings from langchain_openai.llms import OpenAI @@ -16,17 +9,11 @@ load_dotenv() -# Get Qdrant API key from environment -QDRANT_API_KEY = os.environ.get("QDRANT_API_KEY", "your-qdrant-api-key") -# Qdrant URL -QDRANT_URL = os.environ.get("QDRANT_URL") +# Qdrant DB path +QDRANT_PATH = "qdrant.db" # Qdrant DB collection name DEFAULT_COLLECTION_NAME = "identity-enabled-rag" -print( - f"QDRANT Config: \n\tKey: {QDRANT_API_KEY[:10]}{'x' * len(QDRANT_API_KEY[10:])}, \n\tUrl: {QDRANT_URL}\n" -) - class PebbloIdentityRAG: def __init__(self, collection_name: str = DEFAULT_COLLECTION_NAME): @@ -38,11 +25,10 @@ def __init__(self, collection_name: str = DEFAULT_COLLECTION_NAME): def init_vector_db(self): """ - Load Vector DB + Load Vector DB from file """ client = QdrantClient( - url=QDRANT_URL, - api_key=QDRANT_API_KEY, + path=QDRANT_PATH, ) vectordb = Qdrant( client=client, @@ -51,41 +37,6 @@ def init_vector_db(self): ) return vectordb - def load_documents_to_vector_db(self, folder_id: str): - """ - Load documents to Qdrant Vector DB - """ - print("Loading documents...") - - loader = PebbloSafeLoader( - GoogleDriveLoader( - folder_id=folder_id, - token_path="./google_token.json", - recursive=True, - file_loader_cls=UnstructuredFileIOLoader, - file_loader_kwargs={"mode": "elements"}, - load_auth=True, - ), - name=self.app_name, # App name (Mandatory) - owner="Joe Smith", # Owner (Optional) - description="Identity enabled SafeLoader and SafeRetrival app using Pebblo", # Description (Optional) - ) - documents = loader.load() - print(f"Loaded {len(documents)} documents ...\n") - print(f"First document: {documents[0]}") - - # Load documents into VectorDB - print("Hydrating Vector DB ...") - - _vectordb = Qdrant.from_documents( - documents, - self.embeddings, - url=QDRANT_URL, - api_key=QDRANT_API_KEY, - collection_name=self.qdrant_collection_name, - ) - print("Finished hydrating Vector DB ...\n") - def ask(self, question: str, auth_context: dict): # Prepare retriever QA chain retriever = PebbloRetrievalQA.from_chain_type( @@ -100,9 +51,8 @@ def ask(self, question: str, auth_context: dict): if __name__ == "__main__": # TODO: pass the actual GoogleDrive folder id - def_folder_id = "1sRvP0j6L6M_Ll0y_8Qp7cFWUOlpdbfN5" def_service_acc_path = "credentials/service-account.json" - def_ingestion_user_email_address = "user@clouddefense.io" + def_ingestion_user_email_address = "admin@clouddefense.io" input_collection_name = "identity-enabled-rag" rag_app = PebbloIdentityRAG(input_collection_name) @@ -116,14 +66,14 @@ def ask(self, question: str, auth_context: dict): input(f"service-account.json path ({def_service_acc_path}) : ") or def_service_acc_path ) - input_folder_id = input(f"Folder id ({def_folder_id}): ") or def_folder_id - # Load documents to Qdrant Vector DB - rag_app.load_documents_to_vector_db(input_folder_id) + def_end_user = "demo-user-hr@daxa.ai" while True: print("Please enter end user details below") - end_user_email_address = input("User email address : ") + end_user_email_address = ( + input(f"User email address ({def_end_user}): ") or def_end_user + ) prompt = input("Please provide the prompt : ") print(f"User: {end_user_email_address}.\nQuery:{prompt}\n")