diff --git a/pebblo_safeloader/langchain/identity-rag-sharepoint/README.md b/pebblo_safeloader/langchain/identity-rag-sharepoint/README.md
new file mode 100644
index 00000000..434cdebc
--- /dev/null
+++ b/pebblo_safeloader/langchain/identity-rag-sharepoint/README.md
@@ -0,0 +1,16 @@
+For the loader changes, we need to add config values with O365 credentials
+
+`O365_CLIENT_ID = ********************`
+
+`O365_CLIENT_SECRET = ************************`
+
+`O365_TENANT_ID = **********************************`
+
+You also need to install all packages listed in requirements.txt
+
+The following inputs are needed:
+
+def_folder_id = ""
+def_folder_path = ""
+def_file_id = ""
+def_site_id = ""
\ No newline at end of file
diff --git a/pebblo_safeloader/langchain/identity-rag-sharepoint/pebblo_identity_safeload_sharepoint.py b/pebblo_safeloader/langchain/identity-rag-sharepoint/pebblo_identity_safeload_sharepoint.py
new file mode 100644
index 00000000..a8496327
--- /dev/null
+++ b/pebblo_safeloader/langchain/identity-rag-sharepoint/pebblo_identity_safeload_sharepoint.py
@@ -0,0 +1,86 @@
+from typing import List
+
+from dotenv import load_dotenv
+from langchain.schema import Document
+# from langchain_community.document_loaders import UnstructuredFileIOLoader
+from langchain_community.document_loaders.pebblo import PebbloSafeLoader
+from langchain_community.vectorstores.qdrant import Qdrant
+from langchain_community.document_loaders.sharepoint import SharePointLoader
+from langchain_openai.embeddings import OpenAIEmbeddings
+
+load_dotenv()
+
+# Qdrant DB path
+QDRANT_PATH = "qdrant.db"
+# Qdrant DB collection name
+COLLECTION_NAME = "identity-enabled-rag"
+
+class IdentityBasedSharePointDataLoader:
+    """
+    Load SharePoint documents through PebbloSafeLoader and store them in a Qdrant collection
+    """
+    def __init__(self, folder_id: str, folder_path: str, file_id: str, site_id: str, collection_name: str = COLLECTION_NAME):
+        self.app_name = "acme-corp-rag-1"
+        self.folder_id = folder_id
+        self.file_id = file_id
+        self.site_id = site_id
+        self.folder_path = folder_path
+        self.qdrant_collection_name = collection_name
+
+    def load_documents(self):
+        """
+        Load documents with PebbloSafeLoader(SharePointLoader), print their authorized identities and return them
+        """
+        print("\nLoading RAG documents ...")
+        loader = PebbloSafeLoader(
+            SharePointLoader(
+                document_library_id=self.folder_id,
+                folder_path=self.folder_path,
+                file_id=self.file_id,
+                site_id=self.site_id,
+                auth_with_token=False
+            ),
+            name=self.app_name,  # App name (Mandatory)
+            owner="Joe Smith",  # Owner (Optional)
+            description="Identity enabled SafeLoader and SafeRetrival app using Pebblo",  # Description (Optional)
+        )
+        documents = loader.load()
+        print(documents)
+        unique_identities = set()
+        for doc in documents:
+            # .get() returns None when the key is absent; set.update(None) would raise TypeError
+            unique_identities.update(doc.metadata.get("authorized_identities") or [])
+
+        print(f"Authorized Identities: {list(unique_identities)}")
+        print(f"Loaded {len(documents)} documents ...\n")
+        return documents
+
+    def add_docs_to_qdrant(self, documents: List[Document]):
+        """
+        Load documents into Qdrant
+        """
+        print("\nAdding documents to Qdrant ...")
+        embeddings = OpenAIEmbeddings()
+        vectordb = Qdrant.from_documents(
+            documents,
+            embeddings,
+            path=QDRANT_PATH,
+            collection_name=self.qdrant_collection_name,
+        )
+        print(f"Added {len(documents)} documents to Qdrant ...\n")
+        return vectordb
+
+
+if __name__ == "__main__":
+    print("Loading documents to Qdrant ...")
+    def_folder_id = ""
+    def_folder_path = ""
+    def_file_id = ""
+    def_site_id = ""
+    input_collection_name = "identity-enabled-rag"
+
+    qloader = IdentityBasedSharePointDataLoader(def_folder_id, def_folder_path, def_file_id, def_site_id, input_collection_name)
+
+    result_documents = qloader.load_documents()
+
+    vectordb_obj = qloader.add_docs_to_qdrant(result_documents)
diff --git a/pebblo_safeloader/langchain/identity-rag-sharepoint/requirements.txt b/pebblo_safeloader/langchain/identity-rag-sharepoint/requirements.txt
new file mode 100644
index 00000000..7e71aab6
--- /dev/null
+++ b/pebblo_safeloader/langchain/identity-rag-sharepoint/requirements.txt
@@ -0,0 +1,8 @@
+python-dotenv
+requests
+langchain
+o365
+pymupdf
+langchain_openai
+langchain-community
+qdrant-client
\ No newline at end of file
diff --git a/pebblo_saferetriever/langchain/identity-rag-sharepoint/pebblo_identity_rag_sharepoint.py b/pebblo_saferetriever/langchain/identity-rag-sharepoint/pebblo_identity_rag_sharepoint.py
new file mode 100644
index 00000000..09dd1956
--- /dev/null
+++ b/pebblo_saferetriever/langchain/identity-rag-sharepoint/pebblo_identity_rag_sharepoint.py
@@ -0,0 +1,141 @@
+# Fill-in OPENAI_API_KEY in .env file in this directory before proceeding
+from dotenv import load_dotenv
+from sharepoint_auth import get_authorized_identities
+from langchain.chains import PebbloRetrievalQA
+from langchain.chains.pebblo_retrieval.models import AuthContext, ChainInput
+from langchain_community.document_loaders import UnstructuredFileIOLoader
+from langchain_community.document_loaders.pebblo import PebbloSafeLoader
+from langchain_community.vectorstores.qdrant import Qdrant
+from langchain_community.document_loaders.sharepoint import SharePointLoader
+from langchain_openai.embeddings import OpenAIEmbeddings
+from langchain_openai.llms import OpenAI
+import json
+import os
+
+load_dotenv()
+
+
+class PebbloIdentityRAGSharePoint:
+    """
+    Load SharePoint documents with PebbloSafeLoader, index them in Qdrant and answer questions via PebbloRetrievalQA
+    """
+    def __init__(self, folder_id: str, folder_path: str, file_id: str, site_id: str, collection_name: str):
+        self.app_name = "pebblo-identity-rag-1"
+        self.folder_id = folder_id
+        self.file_id = file_id
+        self.site_id = site_id
+        self.folder_path = folder_path
+        self.collection_name = collection_name
+
+        # Load documents
+        print("\nLoading RAG documents ...")
+        self.loader = PebbloSafeLoader(
+            SharePointLoader(
+                document_library_id=self.folder_id,
+                folder_path=self.folder_path,
+                file_id=self.file_id,
+                site_id=self.site_id,
+                auth_with_token=False
+            ),
+            name=self.app_name,  # App name (Mandatory)
+            owner="Joe Smith",  # Owner (Optional)
+            description="Identity enabled SafeLoader and SafeRetrival app using Pebblo",  # Description (Optional)
+        )
+        self.documents = self.loader.load()
+        # Indexing [-1] on an empty result would raise IndexError, so only peek when something was loaded
+        if self.documents:
+            print(self.documents[-1].metadata.get("authorized_identities"))
+        print(f"Loaded {len(self.documents)} documents ...\n")
+
+        # Load documents into VectorDB
+
+        print("Hydrating Vector DB ...")
+        self.vectordb = self.embeddings()
+        print("Finished hydrating Vector DB ...\n")
+
+        # Prepare LLM
+        self.llm = OpenAI()
+        print("Initializing PebbloRetrievalQA ...")
+        self.retrieval_chain = self.init_retrieval_chain()
+
+    def init_retrieval_chain(self):
+        """
+        Initialize PebbloRetrievalQA chain
+        """
+        return PebbloRetrievalQA.from_chain_type(
+            llm=self.llm,
+            chain_type="stuff",
+            retriever=self.vectordb.as_retriever(),
+            verbose=True,
+        )
+
+    def embeddings(self):
+        """
+        Build an in-memory Qdrant collection from self.documents using OpenAIEmbeddings and return it
+        """
+        embeddings = OpenAIEmbeddings()
+        vectordb = Qdrant.from_documents(
+            self.documents,
+            embeddings,
+            location=":memory:",
+            collection_name=self.collection_name,
+        )
+        return vectordb
+
+    def ask(self, question: str, user_email: str, auth_identifiers: list):
+        """
+        Invoke the retrieval chain with the question and an AuthContext built from user_email and auth_identifiers
+        """
+        auth_context = {
+            "username": user_email,
+            "authorized_identities": auth_identifiers,
+        }
+        auth_context = AuthContext(**auth_context)
+        chain_input = ChainInput(query=question, auth_context=auth_context)
+
+        return self.retrieval_chain.invoke(chain_input.dict())
+
+
+if __name__ == "__main__":
+    input_collection_name = "identity-enabled-rag"
+
+    print("Please enter ingestion user details for loading data...")
+    ingestion_user_email_address = input("email address : ")
+    # ingestion_user_service_account_path = input("service-account.json path : ")
+    input_folder_id = input("Folder id : ")
+    input_folder_path = input("Folder path : ")
+    input_file_id = input("File id : ")
+    input_site_id = input("Site id : ")
+    rag_app = PebbloIdentityRAGSharePoint(
+        folder_id=input_folder_id, folder_path=input_folder_path, file_id=input_file_id, site_id=input_site_id, collection_name=input_collection_name
+    )
+
+    while True:
+        print("Please enter end user details below")
+        end_user_email_address = input("User email address : ")
+        prompt = input("Please provide the prompt : ")
+        print(f"User: {end_user_email_address}.\nQuery:{prompt}\n")
+
+        ingestion_user_service_account_path = os.path.expanduser('~') + '/.credentials/o365_token.txt'
+        with open(ingestion_user_service_account_path) as f:
+            s = f.read()
+            data = json.loads(s)
+
+        access_token = data.get("access_token")
+        authorized_identities = get_authorized_identities(
+            # admin_user_email_address=ingestion_user_email_address,
+            access_token=access_token,
+            user_email=end_user_email_address,
+        )
+        response = rag_app.ask(prompt, end_user_email_address, authorized_identities)
+        print(f"Response:\n{response}")
+        try:
+            continue_or_exist = int(input("\n\nType 1 to continue and 0 to exit : "))
+        except ValueError:
+            print("Please provide valid input")
+            continue
+
+        if not continue_or_exist:
+            exit(0)
+
+        print("\n\n")
diff --git a/pebblo_saferetriever/langchain/identity-rag-sharepoint/requirements.txt b/pebblo_saferetriever/langchain/identity-rag-sharepoint/requirements.txt
new file mode 100644
index 00000000..38c01cd0
--- /dev/null
+++ b/pebblo_saferetriever/langchain/identity-rag-sharepoint/requirements.txt
@@ -0,0 +1,7 @@
+python-dotenv
+requests
+langchain
+o365
+pymupdf
+langchain_openai
+langchain-community
\ No newline at end of file
diff --git a/pebblo_saferetriever/langchain/identity-rag-sharepoint/sharepoint_auth.py b/pebblo_saferetriever/langchain/identity-rag-sharepoint/sharepoint_auth.py
new file mode 100644
index 00000000..0ee2c60d
--- /dev/null
+++ b/pebblo_saferetriever/langchain/identity-rag-sharepoint/sharepoint_auth.py
@@ -0,0 +1,28 @@
+import requests
+import os
+
+def get_authorized_identities(access_token, user_email):
+    """
+    Return the display names of the groups user_email is a member of whose groupTypes contains 'Unified'
+    (queried from the Microsoft Graph memberOf endpoint, using access_token as the bearer token)
+    """
+    url = f"https://graph.microsoft.com/v1.0/users/{user_email}/memberOf"
+
+    payload={}
+
+    headers = {
+        "Authorization": f"Bearer {access_token}"
+    }
+
+    response = requests.request("GET", url, headers=headers, data=payload)
+    # Surface HTTP failures directly instead of as a KeyError on the missing 'value' key
+    response.raise_for_status()
+
+    groups_list = response.json()['value']
+    manually_created_groups = [group for group in groups_list if 'Unified' in group.get('groupTypes', [])]
+
+    group_names = [group_data.get('displayName') for group_data in manually_created_groups]
+
+    print(f"Authorized : {group_names}")
+    # The caller passes this result to the retrieval chain as authorized_identities, so it must be returned
+    return group_names
\ No newline at end of file