daxa-ai · shreyas-damle · Mar 15, 2024 · Mar 15, 2024
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
@@ -6,7 +6,7 @@ on:
     paths:
       - "pebblo/**/*.py"
       - "tests/**/*.py"
-      - "samples/**/*.py"
+      - "pebblo_safeloader/**/*.py"
 
 jobs:
 

diff --git a/.github/workflows/linting_skipper.yml b/.github/workflows/linting_skipper.yml
@@ -8,7 +8,8 @@ on:
       - "docs/**/*.png"
       - "docs/**/*.jpg"
       - "docs/**/*.yml"
-      - "samples/**/*.md"
+      - "pebblo_safeloader/**/*.md"
+      - "pebblo_saferetriever/**/*.md"
       - "**/pyproject.toml"
 
 jobs:

diff --git a/Makefile b/Makefile
@@ -41,7 +41,7 @@ lint:
 	ruff check .
 	ruff format . --diff
 	ruff --select I .
-	mkdir -p $(MYPY_CACHE) && mypy --install-types --non-interactive $(PYTHON_FILES) --cache-dir $(MYPY_CACHE) --exclude build/
+	mkdir -p $(MYPY_CACHE) && mypy --install-types --non-interactive $(PYTHON_FILES) --cache-dir $(MYPY_CACHE) --exclude build/ --exclude pebblo_saferetriever
 
 spell_check:
 	codespell --toml pyproject.toml

diff --git a/README.md b/README.md
@@ -87,7 +87,7 @@ The Pebblo SafeLoader can be enabled with few lines of code change to the above
     vectordb = Chroma.from_documents(documents, OpenAIEmbeddings())
 ```
 
-See [here](https://github.com/srics/pebblo/tree/main/samples) for samples with Pebblo enabled RAG applications and [this](https://daxa-ai.github.io/pebblo/rag) document for more details.
+See [here](https://github.com/srics/pebblo/tree/main/pebblo_safeloader) for samples with Pebblo enabled RAG applications and [this](https://daxa-ai.github.io/pebblo/rag) document for more details.
 
 # Contribution
 

diff --git a/pebblo_safeloader/README.md b/pebblo_safeloader/README.md
@@ -0,0 +1,3 @@
+# Overview
+
+`Pebblo SafeLoader` provides visibility and enforcement for Semantic, Entity, and Identity of data ingested into RAG applications.
diff --git a/samples/langchain/acme-corp-rag/.env → ...o_safeloader/langchain/acme-corp-rag/.env b/samples/langchain/acme-corp-rag/.env → ...o_safeloader/langchain/acme-corp-rag/.env
diff --git a/samples/langchain/acme-corp-rag/README.md → ...eloader/langchain/acme-corp-rag/README.md b/samples/langchain/acme-corp-rag/README.md → ...eloader/langchain/acme-corp-rag/README.md
diff --git a/.../langchain/acme-corp-rag/acme_corp_rag.py → .../langchain/acme-corp-rag/acme_corp_rag.py b/.../langchain/acme-corp-rag/acme_corp_rag.py → .../langchain/acme-corp-rag/acme_corp_rag.py
diff --git a/...ain/acme-corp-rag/acme_corp_rag_pebblo.py → ...ain/acme-corp-rag/acme_corp_rag_pebblo.py b/...ain/acme-corp-rag/acme_corp_rag_pebblo.py → ...ain/acme-corp-rag/acme_corp_rag_pebblo.py
diff --git a/...ngchain/acme-corp-rag/data/topic_data.csv → ...ngchain/acme-corp-rag/data/topic_data.csv b/...ngchain/acme-corp-rag/data/topic_data.csv → ...ngchain/acme-corp-rag/data/topic_data.csv
diff --git a/.../langchain/acme-corp-rag/requirements.txt → .../langchain/acme-corp-rag/requirements.txt b/.../langchain/acme-corp-rag/requirements.txt → .../langchain/acme-corp-rag/requirements.txt
diff --git a/pebblo_saferetriever/README.md b/pebblo_saferetriever/README.md
@@ -0,0 +1,3 @@
+# Overview
+
+`Pebblo SafeRetriever` provides visibility and enforcement for Semantic, Entity, and Identity of prompts and inference response for RAG applications.
diff --git a/pebblo_saferetriever/langchain/identity-rag/README.md b/pebblo_saferetriever/langchain/identity-rag/README.md
@@ -0,0 +1,46 @@
+
+## Identity Enforcement using Pebblo
+
+This solution uses the following two proposed PRs to LangChain:
+
+1. community: add authorization identities to GoogleDriveLoader #18813
+https://github.com/langchain-ai/langchain/pull/18813
+
+2. langchain: add PebbloRetrievalQA chain with Identity & Semantic enforcement #18812
+https://github.com/langchain-ai/langchain/pull/18812
+
+
+### Instructions
+
+1. Create a Python virtual environment
+
+
+```console
+$ python3 -m venv .venv
+$ source .venv/bin/activate
+```
+
+2. Install dependencies
+
+```console
+$ pip3 install -r requirements.txt
+```
+
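+> Note: `requirements.txt` installs `langchain`, `langchain-community`, and `langchain-openai`; it does not list a separate `pebblo-langchain` package. The installed LangChain packages must include the changes from the two PRs listed above.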
+
+3. Populate your `OPENAI_API_KEY` and, if needed, `PEBBLO_CLASSIFIER_URL` in the `.env` file.
+
+```console
+$ cat .env
+OPENAI_API_KEY=""
+PEBBLO_CLASSIFIER_URL="http://localhost:8000/"
+```
+> Note: You need to set `PEBBLO_CLASSIFIER_URL` only if your `Pebblo Server` is running somewhere other than the default URL of `http://localhost:8000`.
+
+4. Run the LangChain sample app, which uses Pebblo SafeLoader and Pebblo SafeRetriever
+
+```console
+$ python3 pebblo_identity_rag.py
+```
+
+5. Retrieve the Pebblo PDF report from `$HOME/.pebblo/pebblo-identity-rag-1/pebblo_report.pdf` on the system where `Pebblo Server` is running.
diff --git a/pebblo_saferetriever/langchain/identity-rag/pebblo_identity_rag.py b/pebblo_saferetriever/langchain/identity-rag/pebblo_identity_rag.py
@@ -0,0 +1,82 @@
+from typing import List
+
+# Fill-in OPENAI_API_KEY in .env file
+# in this directory before proceeding
+from dotenv import load_dotenv
+from langchain.chains import PebbloRetrievalQA
+from langchain.schema import Document
+from langchain_community.document_loaders import (
+    GoogleDriveLoader,
+    UnstructuredFileIOLoader,
+)
+from langchain_community.document_loaders.pebblo import PebbloSafeLoader
+from langchain_community.vectorstores import Chroma
+from langchain_community.vectorstores.utils import filter_complex_metadata
+from langchain_openai.embeddings import OpenAIEmbeddings
+from langchain_openai.llms import OpenAI
+
+load_dotenv()
+
+
+class PebbloIdentityRAG:
+    """Sample RAG app pairing Pebblo SafeLoader with identity-aware retrieval.
+
+    Construction loads a Google Drive folder through ``PebbloSafeLoader``,
+    indexes the loaded documents in a Chroma vector store and prepares an
+    OpenAI LLM. ``ask`` then answers a question through ``PebbloRetrievalQA``.
+    """
+
+    def __init__(self, folder_id: str):
+        """Load the folder's documents and build the vector store.
+
+        Args:
+            folder_id: ID of the Google Drive folder to load; the loader is
+                configured with ``recursive=True``.
+        """
+        self.app_name = "pebblo-identity-rag-1"
+
+        # Load documents
+        print("Loading RAG documents ...")
+        self.loader = PebbloSafeLoader(
+            GoogleDriveLoader(
+                folder_id=folder_id,
+                token_path="./google_token.json",
+                recursive=True,
+                file_loader_cls=UnstructuredFileIOLoader,
+                file_loader_kwargs={"mode": "elements"},
+                # Presumably asks the loader to attach "authorized_identities"
+                # metadata (see the print below) - confirm against the loader.
+                load_auth=True,
+            ),
+            name=self.app_name,  # App name (Mandatory)
+            owner="Joe Smith",  # Owner (Optional)
+            description="Identity enabled SafeLoader and SafeRetrival app using Pebblo",  # Description (Optional)
+        )
+        self.documents = self.loader.load()
+        # Debug aid: show the identities attached to the last loaded document.
+        # Guarded so that an empty result does not raise IndexError on [-1].
+        if self.documents:
+            print(self.documents[-1].metadata.get("authorized_identities"))
+        # NOTE(review): filter_complex_metadata may drop list-valued metadata
+        # such as "authorized_identities" - confirm the identity filter still
+        # has the data it needs after this step.
+        self.filtered_docs = filter_complex_metadata(self.documents)
+        print(f"Loaded {len(self.documents)} documents ...\n")
+
+        # Load documents into VectorDB
+
+        print("Hydrating Vector DB ...")
+        self.vectordb = self.embeddings(self.filtered_docs)
+        print("Finished hydrating Vector DB ...\n")
+
+        # Prepare LLM
+        self.llm = OpenAI()
+
+    @staticmethod
+    def embeddings(docs: List[Document]):
+        """Embed ``docs`` with ``OpenAIEmbeddings`` and return the Chroma store.
+
+        Args:
+            docs: Documents to index.
+
+        Returns:
+            The vector store produced by ``Chroma.from_documents``.
+        """
+        embeddings = OpenAIEmbeddings()
+        vectordb = Chroma.from_documents(docs, embeddings)
+        return vectordb
+
+    def ask(self, question: str, auth_identifiers: dict):
+        """Answer ``question`` through a freshly built ``PebbloRetrievalQA`` chain.
+
+        Args:
+            question: The question passed to the chain.
+            auth_identifiers: Value placed under the ``"$in"`` key of the
+                chain's ``auth_context``.
+
+        Returns:
+            Whatever ``PebbloRetrievalQA.invoke`` returns for the question.
+        """
+        # Prepare retriever QA chain
+        auth = {"$in": auth_identifiers}
+        qa_chain = PebbloRetrievalQA.from_chain_type(
+            llm=self.llm,
+            chain_type="stuff",
+            retriever=self.vectordb.as_retriever(),
+            verbose=True,
+            auth_context=auth,
+        )
+        return qa_chain.invoke(question)
+
+
+if __name__ == "__main__":
+    # Replace the empty string with the ID of the Google Drive folder to load,
+    # for example: folder_id = "1sd0RqMMJKidf9Pb4YRCI2-NH4Udj885k"
+    folder_id = ""
+    # Question to ask and the identities to ask it on behalf of.
+    prompt = "What is adaptive pacing system?"
+    auth_context = {"authorized_identities": ["joe@acme.io", "sam@acme.io"]}
+
+    # Build the app first (loads and indexes the documents), then query it.
+    rag_app = PebbloIdentityRAG(folder_id)
+    print(f"Query:\n{prompt}")
+    response = rag_app.ask(prompt, auth_context)
+    print(f"Response:\n{response}")
diff --git a/pebblo_saferetriever/langchain/identity-rag/requirements.txt b/pebblo_saferetriever/langchain/identity-rag/requirements.txt
@@ -0,0 +1,9 @@
+langchain>=0.1.7
+langchain-community
+langchain-openai
+chromadb==0.4.7
+python-dotenv==1.0.0
+tiktoken
+requests==2.31.0
+Markdown==3.5
+unstructured[all-docs]
diff --git a/pyproject.toml b/pyproject.toml
@@ -163,7 +163,7 @@ package-data = {"pebblo" = ['reports/templates/*', 'reports/assets/*','app/pebbl
 include-package-data = false
 
 [tool.setuptools.packages.find]
-exclude=["tests*", "samples*"]
+exclude=["tests*", "pebblo_safeloader*", "pebblo_saferetriever*"]
 
 [build-system]
 # These are the assumed default build requirements from pip:
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# Overview

		`Pebblo SafeLoader` provides visibility and enforcement for Semantic, Entity, and Identity of data ingested into RAG applications.
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		# Overview

		`Pebblo SafeRetriever` provides visibility and enforcement for Semantic, Entity, and Identity of prompts and inference response for RAG applications.