Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sharepoint loader #361

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions pebblo_safeloader/langchain/identity-rag-sharepoint/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
To use the SharePoint loader, add the following config values with your O365 credentials:

`O365_CLIENT_ID = ********************`

`O365_CLIENT_SECRET = ************************`

`O365_TENANT_ID = **********************************`

You also need to install all the packages listed in `requirements.txt`.

The following inputs are required:

def_folder_id = "<sharepoint_folder_id>"
def_folder_path = "<sharepoint_folder_path>"
def_file_id = "<sharepoint_file_id>"
def_site_id = "<sharepoint_site_id>"
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
from typing import List

from dotenv import load_dotenv
from langchain.schema import Document
# from langchain_community.document_loaders import UnstructuredFileIOLoader
from langchain_community.document_loaders.pebblo import PebbloSafeLoader
from langchain_community.vectorstores.qdrant import Qdrant
from langchain_community.document_loaders.sharepoint import SharePointLoader
from langchain_openai.embeddings import OpenAIEmbeddings

load_dotenv()

# Qdrant DB path
QDRANT_PATH = "qdrant.db"
# Qdrant DB collection name
COLLECTION_NAME = "identity-enabled-rag"

class IdentityBasedSharePointDataLoader:
def __init__(self, folder_id: str, folder_path: str, file_id: str, site_id: str, collection_name: str = COLLECTION_NAME):
    """
    Remember the SharePoint coordinates and the Qdrant collection name
    that the other methods of this loader read.
    """
    # Qdrant collection the documents are written to
    self.qdrant_collection_name = collection_name
    # SharePoint location of the content to load
    self.site_id = site_id
    self.folder_id = folder_id
    self.folder_path = folder_path
    self.file_id = file_id
    # Fixed application name for this sample
    self.app_name = "acme-corp-rag-1"

def load_documents(self):
"""
Load documents from SharePoint through PebbloSafeLoader.

Prints the loaded documents, the set of authorized identities found in
their metadata and the document count, then returns the documents.
"""
print("\nLoading RAG documents ...")
# PebbloSafeLoader wraps the SharePointLoader built from the stored ids/path
loader = PebbloSafeLoader(
SharePointLoader(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have these samples been tested? I am getting the following error on this line:

pebblo/pebblo_safeloader/langchain/identity-rag-sharepoint/pebblo_identity_safeload_sharepoint.py", line 30, in load_documents
      SharePointLoader(
  File "pydantic/main.py", line 339, in pydantic.main.BaseModel.__init__
  File "pydantic/main.py", line 1066, in pydantic.main.validate_model
  File "pydantic/fields.py", line 439, in pydantic.fields.ModelField.get_default
  File "pydantic/env_settings.py", line 39, in pydantic.env_settings.BaseSettings.__init__
  File "pydantic/main.py", line 341, in pydantic.main.BaseModel.__init__
pydantic.error_wrappers.ValidationError: 2 validation errors for _O365Settings
client_id
  field required (type=value_error.missing)
client_secret
  field required (type=value_error.missing)

# Note: the stored folder id is what gets passed as the document library id
document_library_id=self.folder_id,
folder_path=self.folder_path,
file_id=self.file_id,
site_id=self.site_id,
auth_with_token=False
),
name=self.app_name, # App name (Mandatory)
owner="Joe Smith", # Owner (Optional)
description="Identity enabled SafeLoader and SafeRetrival app using Pebblo", # Description (Optional)
)
documents = loader.load()
print(documents)
# Collect the distinct identities across all loaded documents
unique_identities = set()
for doc in documents:
# NOTE(review): .get() returns None when a document has no
# "authorized_identities" key, and set.update(None) raises TypeError.
unique_identities.update(doc.metadata.get("authorized_identities"))

print(f"Authorized Identities: {list(unique_identities)}")
print(f"Loaded {len(documents)} documents ...\n")
return documents

def add_docs_to_qdrant(self, documents: List[Document]):
    """
    Embed the given documents with OpenAI embeddings, write them to the
    Qdrant store at QDRANT_PATH and return the resulting vector store.
    """
    print("\nAdding documents to Qdrant ...")
    store = Qdrant.from_documents(
        documents,
        OpenAIEmbeddings(),
        collection_name=self.qdrant_collection_name,
        path=QDRANT_PATH,
    )
    print(f"Added {len(documents)} documents to Qdrant ...\n")
    return store


if __name__ == "__main__":
    print("Loading documents to Qdrant ...")

    # Placeholder SharePoint inputs; replace them with real values before running
    def_site_id = "<sharepoint_site_id>"
    def_folder_id = "<sharepoint_folder_id>"
    def_folder_path = "<sharepoint_folder_path>"
    def_file_id = "<sharepoint_file_id>"
    input_collection_name = "identity-enabled-rag"

    data_loader = IdentityBasedSharePointDataLoader(
        folder_id=def_folder_id,
        folder_path=def_folder_path,
        file_id=def_file_id,
        site_id=def_site_id,
        collection_name=input_collection_name,
    )

    # First load from SharePoint, then index the result into Qdrant
    result_documents = data_loader.load_documents()
    vectordb_obj = data_loader.add_docs_to_qdrant(result_documents)
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
python-dotenv
requests
langchain
o365
pymupdf
langchain_openai
langchain-community
qdrant-client
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
# Fill-in OPENAI_API_KEY in .env file in this directory before proceeding
from dotenv import load_dotenv
from sharepoint_auth import get_authorized_identities
from langchain.chains import PebbloRetrievalQA
from langchain.chains.pebblo_retrieval.models import AuthContext, ChainInput
from langchain_community.document_loaders import UnstructuredFileIOLoader
from langchain_community.document_loaders.pebblo import PebbloSafeLoader
from langchain_community.vectorstores.qdrant import Qdrant
from langchain_community.document_loaders.sharepoint import SharePointLoader
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_openai.llms import OpenAI
import json
import os

load_dotenv()


class PebbloIdentityRAGSharePoint:
def __init__(self, folder_id: str, folder_path: str, file_id: str, site_id: str, collection_name: str):
    """
    Build the complete RAG pipeline at construction time.

    Loads the SharePoint documents through PebbloSafeLoader, indexes them
    into the vector DB, creates the OpenAI LLM and initializes the
    PebbloRetrievalQA chain.
    """
    self.app_name = "pebblo-identity-rag-1"
    self.collection_name = collection_name
    self.site_id = site_id
    self.folder_id = folder_id
    self.folder_path = folder_path
    self.file_id = file_id

    # Load documents
    print("\nLoading RAG documents ...")
    sharepoint_loader = SharePointLoader(
        document_library_id=self.folder_id,
        folder_path=self.folder_path,
        file_id=self.file_id,
        site_id=self.site_id,
        auth_with_token=False,
    )
    self.loader = PebbloSafeLoader(
        sharepoint_loader,
        name=self.app_name,  # App name (Mandatory)
        owner="Joe Smith",  # Owner (Optional)
        description="Identity enabled SafeLoader and SafeRetrival app using Pebblo",  # Description (Optional)
    )
    self.documents = self.loader.load()
    # Show the identities attached to the last loaded document
    last_doc_identities = self.documents[-1].metadata.get("authorized_identities")
    print(last_doc_identities)
    print(f"Loaded {len(self.documents)} documents ...\n")

    # Load documents into VectorDB
    print("Hydrating Vector DB ...")
    self.vectordb = self.embeddings()
    print("Finished hydrating Vector DB ...\n")

    # Prepare LLM
    self.llm = OpenAI()
    print("Initializing PebbloRetrievalQA ...")
    self.retrieval_chain = self.init_retrieval_chain()

def init_retrieval_chain(self):
"""
Initialize PebbloRetrievalQA chain

Builds a "stuff" chain from self.llm and a retriever obtained from
self.vectordb, with verbose output enabled, and returns it.
"""
return PebbloRetrievalQA.from_chain_type(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@arpitkumar980 Update the initialization of PebbloSafeLoader and PebbloRetrievalQA to match the latest sample in the main branch, and then we can merge this PR.

llm=self.llm,
chain_type="stuff",
# Retriever over the vector DB built from the loaded documents
retriever=self.vectordb.as_retriever(),
verbose=True,
)

def embeddings(self):
    """
    Index self.documents into an in-memory Qdrant collection using OpenAI
    embeddings and return the resulting vector store.
    """
    return Qdrant.from_documents(
        self.documents,
        OpenAIEmbeddings(),
        collection_name=self.collection_name,
        location=":memory:",
    )

def ask(self, question: str, user_email: str, auth_identifiers: list):
    """
    Run the retrieval chain for `question` on behalf of `user_email`.

    `auth_identifiers` is passed along as the user's authorized identities
    in the chain's auth context. Returns whatever the chain's invoke() returns.
    """
    context = AuthContext(
        username=user_email,
        authorized_identities=auth_identifiers,
    )
    payload = ChainInput(query=question, auth_context=context).dict()
    return self.retrieval_chain.invoke(payload)


if __name__ == "__main__":
    # Interactive driver: load the SharePoint content once, then answer
    # prompts for end users until the operator chooses to stop.
    input_collection_name = "identity-enabled-rag"

    print("Please enter ingestion user details for loading data...")
    # Prompt kept for the operator flow; the value is currently unused.
    ingestion_user_email_address = input("email address : ")
    input_folder_id = input("Folder id : ")
    # Asked exactly once (the folder path used to be prompted for twice).
    input_folder_path = input("Folder path : ")
    input_file_id = input("File id : ")
    input_site_id = input("Site id : ")
    rag_app = PebbloIdentityRAGSharePoint(
        folder_id=input_folder_id,
        folder_path=input_folder_path,
        file_id=input_file_id,
        site_id=input_site_id,
        collection_name=input_collection_name,
    )

    # O365 token file in the user's home directory
    token_path = os.path.join(
        os.path.expanduser("~"), ".credentials", "o365_token.txt"
    )

    while True:
        print("Please enter end user details below")
        end_user_email_address = input("User email address : ")
        prompt = input("Please provide the prompt : ")
        print(f"User: {end_user_email_address}.\nQuery:{prompt}\n")

        # Read inside the loop so the token currently on disk is used
        # for every question.
        with open(token_path, encoding="utf-8") as token_file:
            access_token = json.load(token_file).get("access_token")

        authorized_identities = get_authorized_identities(
            access_token=access_token,
            user_email=end_user_email_address,
        )
        response = rag_app.ask(prompt, end_user_email_address, authorized_identities)
        print(f"Response:\n{response}")

        try:
            continue_or_exit = int(input("\n\nType 1 to continue and 0 to exit : "))
        except ValueError:
            # Invalid answer: report it and start the next round
            print("Please provide valid input")
            continue

        if not continue_or_exit:
            # Nothing follows the loop, so leaving it ends the script with
            # status 0 without depending on the site-provided exit().
            break

        print("\n\n")
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
python-dotenv
requests
langchain
o365
pymupdf
langchain_openai
langchain-community
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import requests
import os

def get_authorized_identities(access_token, user_email):
    """
    Return the display names of the "Unified" groups a user belongs to.

    Calls the Microsoft Graph `memberOf` endpoint for `user_email` and keeps
    the entries whose `groupTypes` contain "Unified".

    Args:
        access_token: Bearer token sent in the Authorization header.
        user_email: User whose memberships are looked up.

    Returns:
        List with the `displayName` of each matching group (possibly empty).

    Raises:
        requests.HTTPError: If Graph answers with an error status code.
    """
    url = f"https://graph.microsoft.com/v1.0/users/{user_email}/memberOf"
    headers = {"Authorization": f"Bearer {access_token}"}

    # A timeout keeps the interactive caller from hanging forever on a
    # stalled connection.
    response = requests.get(url, headers=headers, timeout=30)
    # Surface HTTP failures directly instead of a KeyError on the missing
    # "value" key of an error payload.
    response.raise_for_status()

    # NOTE(review): only the first page of results is read; a paged
    # response would need its continuation link followed.
    groups_list = response.json().get("value", [])
    group_names = [
        group.get("displayName")
        for group in groups_list
        if "Unified" in group.get("groupTypes", [])
    ]

    print(f"Authorized : {group_names}")
    # The caller passes this list on as the user's authorized identities;
    # without the return it always received None.
    return group_names