From 8c6254c736565a3444aa0b34b7a310f8817ccd3e Mon Sep 17 00:00:00 2001 From: Rajendra Kadam Date: Fri, 15 Nov 2024 13:43:55 +0530 Subject: [PATCH 1/2] Added GoogleDrive-Pinecone Sample --- .../googledrive-pinecone/.env.sample | 13 + .../googledrive-pinecone/.gitignore | 2 + .../googledrive-pinecone/README.md | 69 ++++++ .../googledrive-pinecone/google_auth.py | 32 +++ .../googledrive-pinecone/pebblo_saferag.py | 223 ++++++++++++++++++ .../googledrive-pinecone/pinecone_index.py | 48 ++++ .../googledrive-pinecone/requirements.txt | 6 + .../googledrive-pinecone/utils.py | 46 ++++ 8 files changed, 439 insertions(+) create mode 100644 pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/.env.sample create mode 100644 pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/.gitignore create mode 100644 pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/README.md create mode 100644 pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/google_auth.py create mode 100644 pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/pebblo_saferag.py create mode 100644 pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/pinecone_index.py create mode 100644 pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/requirements.txt create mode 100644 pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/utils.py diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/.env.sample b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/.env.sample new file mode 100644 index 00000000..ea3b99b3 --- /dev/null +++ b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/.env.sample @@ -0,0 +1,13 @@ +# OpenAI credentials +OPENAI_API_KEY="" + +# Pebblo configuration +PEBBLO_CLASSIFIER_URL="" # e.g "http://localhost:8000/" +PEBBLO_API_KEY= +PEBBLO_CLOUD_URL= + +# Google Drive Config +GOOGLE_APPLICATION_CREDENTIALS="" + +# Vector DB Config +PINECONE_API_KEY="" \ No newline at end of file diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/.gitignore b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/.gitignore new file mode 100644 index 00000000..30de5903 --- /dev/null +++ b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/.gitignore @@ -0,0 +1,2 @@ +credentials +google_token.json \ No newline at end of file diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/README.md b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/README.md new file mode 100644 index 00000000..3a1ef486 --- /dev/null +++ b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/README.md @@ -0,0 +1,69 @@ +## Identity and Semantic Enforcement using Pebblo + +This solution uses the following daxa/langchain and daxa/langchain-google branches: + +- daxa-ai/langchain: https://github.com/daxa-ai/langchain/tree/pebblo-0.1.21 +- daxa-ai/langchain-google: https://github.com/daxa-ai/langchain-google/tree/pebblo-0.1.21 + +### Prerequisites + +1. Sign up and set up your account on Pinecone (https://www.pinecone.io/). + +### Instructions + +1. Create Python virtual-env + +```console +$ python3 -m venv .venv +$ source .venv/bin/activate +``` + +2. Install dependencies + +```console +$ pip3 install -r requirements.txt +``` + +3. Install langchain-core and langchain-community from the branch `pebblo-0.1.21` + +```console +$ git clone https://github.com/daxa-ai/langchain.git +$ cd langchain +$ git fetch && git checkout pebblo-0.1.21 +$ cd libs/community +$ pip3 install langchain-community . +$ cd ../core +$ pip3 install langchain-core . +``` + +4. Install langchain-google from the branch `pebblo-0.1.21` + +```console +$ git clone https://github.com/daxa-ai/langchain-google.git +$ cd langchain-google +$ git fetch && git checkout pebblo-0.1.21 +$ cd libs/community +$ pip3 install langchain-google-community . +``` + +5.Copy the `.env.sample` file to `.env` and populate the necessary environment variable. + +5. Update the `pebblo_saferag.py` file with the following details: + +- _folder_id_: Google Drive folder ID where the documents are stored +- _service_acc_def_: Google service account credentials file path +- _ing_user_email_def_: Google Drive Admin/Ingestion user email ID + + +5. Run langchain sample app PebbloSafeLoader and PebbloRetrievalQA + +```console +$ python3 pebblo_saferag.py +``` + +6. Retrieve the Pebblo PDF report in `$HOME/.pebblo/pebblo-identity-n-semantic-loader-pinecone/pebblo_report.pdf` file path on the system + where `Pebblo Server` is + running. + +7. To access the Pebblo UI, point the browser to `https://localhost:8000/pebblo` or `host:port/pebblo` if you are running the server on a different + host. diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/google_auth.py b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/google_auth.py new file mode 100644 index 00000000..0d43de48 --- /dev/null +++ b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/google_auth.py @@ -0,0 +1,32 @@ +from typing import List + +from google.oauth2 import service_account +from googleapiclient.discovery import build + + +def get_authorized_identities( + admin_user_email_address: str, credentials_file_path: str, user_email: str +) -> List[str]: + """ + Get authorized identities from Google Directory API + """ + _authorized_identities = [user_email] + credentials = service_account.Credentials.from_service_account_file( + credentials_file_path, + scopes=[ + "https://www.googleapis.com/auth/admin.directory.group.readonly", + "https://www.googleapis.com/auth/admin.directory.group", + ], + subject=admin_user_email_address, + ) + directory_service = build("admin", "directory_v1", credentials=credentials) + + try: + groups = directory_service.groups().list(userKey=user_email).execute() + for group in groups.get("groups", []): + group_email = group["email"] + _authorized_identities.append(group_email) + except Exception as e: + print(f"Error in : {e}") + print(f"User: {user_email}, \nAuthorized Identities: {_authorized_identities}\n") + return _authorized_identities diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/pebblo_saferag.py b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/pebblo_saferag.py new file mode 100644 index 00000000..2b1058f9 --- /dev/null +++ b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/pebblo_saferag.py @@ -0,0 +1,223 @@ +""" +Sample app to demonstrate the usage of PebbloSafeLoader, and PebbloRetrievalQA +for semantic enforcement using Pinecone VectorDB in RAG. +""" + +import time +from pathlib import Path +from typing import List, Optional + +from dotenv import load_dotenv +from google_auth import get_authorized_identities +from langchain_community.chains import PebbloRetrievalQA +from langchain_community.chains.pebblo_retrieval.models import ( + AuthContext, + ChainInput, + SemanticContext, +) +from langchain_community.document_loaders import UnstructuredFileIOLoader +from langchain_community.document_loaders.pebblo import PebbloSafeLoader +from langchain_community.vectorstores.pinecone import Pinecone as PineconeVectorStore +from langchain_google_community import GoogleDriveLoader +from langchain_openai.embeddings import OpenAIEmbeddings +from langchain_openai.llms import OpenAI +from pinecone_index import create_pinecone_index +from utils import describe_pebblo_semantic_stats, format_text, get_input_as_list + +load_dotenv() + + +class SafeRetrieverSemanticRAG: + """ + Sample app to demonstrate the usage of PebbloSafeLoader, and PebbloRetrievalQA + for semantic enforcement using Pinecone VectorDB in RAG. + + + Args: + folder_id (str): Google Drive folder id + index_name (str): Index name for Pinecone + """ + + def __init__(self, folder_id: str, index_name: str): + self.loader_app_name = "pebblo-identity-n-semantic-loader-pinecone" + self.retrieval_app_name = "pebblo-identity-n-semantic-retriever-pinecone" + self.folder_id = folder_id + self.pinecone_index_name = index_name + # Prepare LLM + self.llm = OpenAI() + self.embeddings = OpenAIEmbeddings() + # Load documents from Google Drive + self.documents = self.load_documents() + # Initialize VectorDB + self.vectordb = self.init_vector_db() + # Initialize PebbloRetrievalQA + self.retrieval_chain = self.init_retrieval_chain() + + def load_documents(self): + """ + Load documents from Google Drive + """ + print("\nLoading RAG documents ...") + loader = PebbloSafeLoader( + GoogleDriveLoader( + folder_id=self.folder_id, + credentials_path=Path("credentials/credentials.json"), + token_path=Path("./google_token.json"), + recursive=True, + file_loader_cls=UnstructuredFileIOLoader, + file_loader_kwargs={"mode": "elements"}, + load_auth=True, + ), + name=self.loader_app_name, # App name (Mandatory) + owner="Joe Smith", # Owner (Optional) + description="Identity enabled SafeLoader app using Pebblo and Pinecone VectorDB", # Description (Optional) + load_semantic=True, + ) + documents = loader.load() + print(f"Loaded {len(documents)} documents ...\n") + describe_pebblo_semantic_stats(documents) + return documents + + def init_vector_db(self) -> PineconeVectorStore: + """ + Create a Pinecone index and load documents into it + """ + # Create index + create_pinecone_index(self.pinecone_index_name, recreate=True) + + print("Loading docs into index...") + texts = [t.page_content for t in self.documents] + metadatas = [t.metadata for t in self.documents] + + # pop "coordinates" from metadata(Nested JSONs are not supported in Pinecone) + for metadata in metadatas: + metadata.pop("coordinates", None) + + vector_store = PineconeVectorStore.from_texts( + texts, + self.embeddings, + metadatas=metadatas, + index_name=self.pinecone_index_name, + ) + + # wait for index to be initialized + print("Waiting for index to be ready...") + time.sleep(5) + + print("Done!") + return vector_store + + def init_retrieval_chain(self): + """ + Initialize PebbloRetrievalQA chain + """ + return PebbloRetrievalQA.from_chain_type( + llm=self.llm, + app_name=self.retrieval_app_name, + owner="Joe Smith", + description="Identity enabled SafeLoader and SafeRetrival app using " + "Pebblo and Pinecone VectorDB", + chain_type="stuff", + retriever=self.vectordb.as_retriever(), + verbose=True, + ) + + def ask( + self, + question: str, + user_email: str, + auth_identifiers: List[str], + topics_to_deny: Optional[List[str]] = None, + entities_to_deny: Optional[List[str]] = None, + ): + """ + Ask a question + """ + auth_context = { + "user_id": user_email, + "user_auth": auth_identifiers, + } + auth_context = AuthContext(**auth_context) + semantic_context = dict() + if topics_to_deny: + semantic_context["pebblo_semantic_topics"] = {"deny": topics_to_deny} + if entities_to_deny: + semantic_context["pebblo_semantic_entities"] = {"deny": entities_to_deny} + + semantic_context = ( + SemanticContext(**semantic_context) if semantic_context else None + ) + + chain_input = ChainInput( + query=question, auth_context=auth_context, semantic_context=semantic_context + ) + + return self.retrieval_chain.invoke(chain_input.dict()) + + +if __name__ == "__main__": + input_index_name = "identity-semantic-enforcement-rag" + folder_id = "" + service_acc_def = "credentials/service-account.json" + ing_user_email_def = "" + + print("Please enter ingestion user details for loading data...") + print("Please enter admin user details...") + ingestion_user_email_address = ( + input(f"email address ({ing_user_email_def}): ") or ing_user_email_def + ) + ingestion_user_service_account_path = ( + input(f"service-account.json path ({service_acc_def}): ") or service_acc_def + ) + rag_app = SafeRetrieverSemanticRAG(folder_id, input_index_name) + + while True: + print("Please enter end user details below") + end_user_email_address = input("User email address : ") + + auth_identifiers = get_authorized_identities( + admin_user_email_address=ingestion_user_email_address, + credentials_file_path=ingestion_user_service_account_path, + user_email=end_user_email_address, + ) + + print( + "Please enter semantic filters below...\n" + "(Leave these fields empty if you do not wish to enforce any semantic filters)" + ) + topic_to_deny = get_input_as_list( + "Topics to deny, comma separated (Optional): " + ) + entity_to_deny = get_input_as_list( + "Entities to deny, comma separated (Optional): " + ) + + prompt = input("Please provide the prompt: ") + print( + f"User: {end_user_email_address}.\n" + f"\nTopics to deny: {topic_to_deny}\n" + f"Entities to deny: {entity_to_deny}\n" + f"Query: {format_text(prompt)}" + ) + response = rag_app.ask( + prompt, + end_user_email_address, + auth_identifiers, + topic_to_deny, + entity_to_deny, + ) + + print(f"Response:\n" f"{format_text(response['result'])}") + + try: + continue_or_exist = int( + input("\n\nType 1 to continue and 0 to exit (1): ") or 1 + ) + except ValueError: + print("Please provide valid input") + continue + + if not continue_or_exist: + exit(0) + + print("\n\n") diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/pinecone_index.py b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/pinecone_index.py new file mode 100644 index 00000000..a3f76f69 --- /dev/null +++ b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/pinecone_index.py @@ -0,0 +1,48 @@ +import os +import time + +from dotenv import load_dotenv +from pinecone import Pinecone, PodSpec + +load_dotenv() + +PINECONE_API_KEY = os.getenv("PINECONE_API_KEY") + + +def create_pinecone_index(pinecone_index_name: str, recreate: bool = False): + """ + Create a new Pinecone index + """ + + # configure client + pc = Pinecone(api_key=PINECONE_API_KEY) + # Update the environment/PodSpec to match the one you have access to + environment = "gcp-starter" + spec = PodSpec(environment=environment) + + # check for and delete index if already exists + if pinecone_index_name in pc.list_indexes().names(): + if not recreate: + print(f"Index {pinecone_index_name} already exists. skipping...") + return + else: + # Delete and create a new index + print(f"Deleting and recreating index: {pinecone_index_name} ...") + pc.delete_index(pinecone_index_name) + print(f"Deleted index: {pinecone_index_name}.") + + print(f"Creating index: {pinecone_index_name}...") + # create a new index + pc.create_index( + pinecone_index_name, + dimension=1536, # dimensionality of text-embedding-ada-002 + metric="dotproduct", + spec=spec, + ) + + # wait for index to be initialized + while not pc.describe_index(pinecone_index_name).status["ready"]: + time.sleep(1) + + index = pc.Index(pinecone_index_name) + index.describe_index_stats() diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/requirements.txt b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/requirements.txt new file mode 100644 index 00000000..acf5e7ba --- /dev/null +++ b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/requirements.txt @@ -0,0 +1,6 @@ +python-dotenv==1.0.0 +requests==2.31.0 +unstructured +google-api-python-client # For Google Auth +langchain-openai +pinecone-client # for Pinecone VectorStore \ No newline at end of file diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/utils.py b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/utils.py new file mode 100644 index 00000000..380583b9 --- /dev/null +++ b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/utils.py @@ -0,0 +1,46 @@ +import textwrap +from typing import List, Optional + + +def describe_pebblo_semantic_stats(documents: list) -> None: + """ + Describe the semantic stats of the documents + """ + unique_identities = set() + unique_topics = set() + unique_entities = set() + + for doc in documents: + unique_identities.update(doc.metadata.get("authorized_identities", [])) + unique_topics.update(doc.metadata.get("pebblo_semantic_topics", [])) + unique_entities.update(doc.metadata.get("pebblo_semantic_entities", [])) + + print("\nIndentity and Semantic Stats:") + print(f"Authorized Identities: {list(unique_identities)}") + print(f"Semantic Topics: {list(unique_topics)}") + print(f"Semantic Entities: {list(unique_entities)}") + print("\n") + + +def format_text(text: str, width: int = 120): + """ + Format the text to a given width + """ + formatted_text = textwrap.fill( + text, + width=width, + fix_sentence_endings=True, + replace_whitespace=False, + ) + return formatted_text + + +def get_input_as_list(prompt_text: str) -> Optional[List[str]]: + """ + Get user input as list + """ + user_input = input(prompt_text) + if user_input: + return [item.strip() for item in user_input.split(",") if item.strip()] + else: + return None From ff07941ab3bd8d3e1b622d84c27219e664ed68bf Mon Sep 17 00:00:00 2001 From: Rajendra Kadam Date: Fri, 15 Nov 2024 16:56:53 +0530 Subject: [PATCH 2/2] Update Pinecone package --- .../googledrive-pinecone/README.md | 93 +++++++++---------- .../googledrive-pinecone/google_auth.py | 4 +- .../googledrive-pinecone/pebblo_saferag.py | 15 +-- .../googledrive-pinecone/requirements.txt | 3 +- 4 files changed, 54 insertions(+), 61 deletions(-) diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/README.md b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/README.md index 3a1ef486..c8dfddad 100644 --- a/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/README.md +++ b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/README.md @@ -6,64 +6,55 @@ This solution uses the following daxa/langchain and daxa/langchain-google branch - daxa-ai/langchain-google: https://github.com/daxa-ai/langchain-google/tree/pebblo-0.1.21 ### Prerequisites +1. Google Cloud project. Follow [LangChain GoogleDrive loader](https://python.langchain.com/v0.2/docs/integrations/document_loaders/google_drive/#prerequisites) docs for details on specific steps required to be completed in Google Cloud. +2. Sign up and set up your account on Pinecone (https://www.pinecone.io/). -1. Sign up and set up your account on Pinecone (https://www.pinecone.io/). ### Instructions 1. Create Python virtual-env - -```console -$ python3 -m venv .venv -$ source .venv/bin/activate -``` + ```console + $ python3 -m venv .venv + $ source .venv/bin/activate + ``` 2. Install dependencies - -```console -$ pip3 install -r requirements.txt -``` - -3. Install langchain-core and langchain-community from the branch `pebblo-0.1.21` - -```console -$ git clone https://github.com/daxa-ai/langchain.git -$ cd langchain -$ git fetch && git checkout pebblo-0.1.21 -$ cd libs/community -$ pip3 install langchain-community . -$ cd ../core -$ pip3 install langchain-core . -``` + ```console + $ pip3 install -r requirements.txt + ``` + +3. Install langchain-community from the branch `pebblo-0.1.21` + ```console + $ git clone https://github.com/daxa-ai/langchain.git + $ cd langchain + $ git fetch && git checkout pebblo-0.1.21 + $ cd libs/community + $ pip3 install langchain-community . + ``` 4. Install langchain-google from the branch `pebblo-0.1.21` - -```console -$ git clone https://github.com/daxa-ai/langchain-google.git -$ cd langchain-google -$ git fetch && git checkout pebblo-0.1.21 -$ cd libs/community -$ pip3 install langchain-google-community . -``` - -5.Copy the `.env.sample` file to `.env` and populate the necessary environment variable. - -5. Update the `pebblo_saferag.py` file with the following details: - -- _folder_id_: Google Drive folder ID where the documents are stored -- _service_acc_def_: Google service account credentials file path -- _ing_user_email_def_: Google Drive Admin/Ingestion user email ID - - -5. Run langchain sample app PebbloSafeLoader and PebbloRetrievalQA - -```console -$ python3 pebblo_saferag.py -``` - -6. Retrieve the Pebblo PDF report in `$HOME/.pebblo/pebblo-identity-n-semantic-loader-pinecone/pebblo_report.pdf` file path on the system - where `Pebblo Server` is - running. - -7. To access the Pebblo UI, point the browser to `https://localhost:8000/pebblo` or `host:port/pebblo` if you are running the server on a different + ```console + $ git clone https://github.com/daxa-ai/langchain-google.git + $ cd langchain-google + $ git fetch && git checkout pebblo-0.1.21 + $ cd libs/community + $ pip3 install langchain-google-community . + ``` + +5. Copy the `.env.sample` file to `.env` and populate the necessary environment variable. + +6. Update the `pebblo_saferag.py` file with the following details: + - _service_acc_def_: Google service account credentials file path + - _folder_id_: Google Drive folder ID where the documents are stored + - _ing_user_email_def_: Google Drive Admin/Ingestion user email ID + +7. Run langchain sample app PebbloSafeLoader and PebbloRetrievalQA + ```console + $ python3 pebblo_saferag.py + ``` + +8. Retrieve the Pebblo PDF report in `$HOME/.pebblo/pebblo-identity-n-semantic-loader-pinecone/pebblo_report.pdf` file path on the system + where `Pebblo Server` is running. + +9. To access the Pebblo UI, point the browser to `https://localhost:8000/pebblo` or `host:port/pebblo` if you are running the server on a different host. diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/google_auth.py b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/google_auth.py index 0d43de48..ae928f5a 100644 --- a/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/google_auth.py +++ b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/google_auth.py @@ -5,14 +5,14 @@ def get_authorized_identities( - admin_user_email_address: str, credentials_file_path: str, user_email: str + admin_user_email_address: str, service_account_file_path: str, user_email: str ) -> List[str]: """ Get authorized identities from Google Directory API """ _authorized_identities = [user_email] credentials = service_account.Credentials.from_service_account_file( - credentials_file_path, + service_account_file_path, scopes=[ "https://www.googleapis.com/auth/admin.directory.group.readonly", "https://www.googleapis.com/auth/admin.directory.group", diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/pebblo_saferag.py b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/pebblo_saferag.py index 2b1058f9..255c50d3 100644 --- a/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/pebblo_saferag.py +++ b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/pebblo_saferag.py @@ -2,6 +2,7 @@ Sample app to demonstrate the usage of PebbloSafeLoader, and PebbloRetrievalQA for semantic enforcement using Pinecone VectorDB in RAG. """ +import os import time from pathlib import Path @@ -17,7 +18,7 @@ ) from langchain_community.document_loaders import UnstructuredFileIOLoader from langchain_community.document_loaders.pebblo import PebbloSafeLoader -from langchain_community.vectorstores.pinecone import Pinecone as PineconeVectorStore +from langchain_pinecone import PineconeVectorStore from langchain_google_community import GoogleDriveLoader from langchain_openai.embeddings import OpenAIEmbeddings from langchain_openai.llms import OpenAI @@ -26,7 +27,6 @@ load_dotenv() - class SafeRetrieverSemanticRAG: """ Sample app to demonstrate the usage of PebbloSafeLoader, and PebbloRetrievalQA @@ -64,7 +64,7 @@ def load_documents(self): credentials_path=Path("credentials/credentials.json"), token_path=Path("./google_token.json"), recursive=True, - file_loader_cls=UnstructuredFileIOLoader, + # file_loader_cls=UnstructuredFileIOLoader, file_loader_kwargs={"mode": "elements"}, load_auth=True, ), @@ -164,11 +164,12 @@ def ask( print("Please enter ingestion user details for loading data...") print("Please enter admin user details...") ingestion_user_email_address = ( - input(f"email address ({ing_user_email_def}): ") or ing_user_email_def + input(f"Email address ({ing_user_email_def}): ") or ing_user_email_def ) - ingestion_user_service_account_path = ( - input(f"service-account.json path ({service_acc_def}): ") or service_acc_def + service_account_file_path = ( + input(f"Path to the service_account.json file ({service_acc_def}): ") or service_acc_def ) + folder_id = input(f"Google Drive folder id ({folder_id}): ") or folder_id rag_app = SafeRetrieverSemanticRAG(folder_id, input_index_name) while True: @@ -177,7 +178,7 @@ def ask( auth_identifiers = get_authorized_identities( admin_user_email_address=ingestion_user_email_address, - credentials_file_path=ingestion_user_service_account_path, + service_account_file_path=service_account_file_path, user_email=end_user_email_address, ) diff --git a/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/requirements.txt b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/requirements.txt index acf5e7ba..b141eb38 100644 --- a/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/requirements.txt +++ b/pebblo_saferetriever/langchain/pebblo-saferag/googledrive-pinecone/requirements.txt @@ -3,4 +3,5 @@ requests==2.31.0 unstructured google-api-python-client # For Google Auth langchain-openai -pinecone-client # for Pinecone VectorStore \ No newline at end of file +pinecone-client # for Pinecone VectorStore +langchain-pinecone