Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Samples: text loader sample using PebbloTextLoader #539

Merged
merged 2 commits into from
Sep 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions pebblo_safeloader/langchain/textloader_postgress/.env.sample
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# OpenAI credentials
OPENAI_API_KEY=<YOUR OPENAI API KEY>

# Pebblo configuration
PEBBLO_CLOUD_URL=<PEBBLO CLOUD URL>
PEBBLO_API_KEY=<YOUR PEBBLO API KEY>
PEBBLO_CLASSIFIER_URL="http://localhost:8000/"

# Postgres configuration
PG_CONNECTION_STRING="postgresql://<USERNAME>:<PASSWORD>@<HOST>:<PORT>/<DATABASE-NAME>"

Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
import logging
import os

from dotenv import load_dotenv
from langchain_community.document_loaders.pebblo import (
PebbloSafeLoader,
PebbloTextLoader,
)
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_postgres import PGVector
from util import get_data

# Load settings from a .env file (see .env.sample) into the process
# environment before any of the values below are read.
load_dotenv()

# Runtime configuration; each value is None when the variable is not set.
# NOTE(review): PEBBLO_CLOUD_URL is not referenced elsewhere in the code
# shown here - presumably the Pebblo loader reads it from the environment
# itself; confirm before removing.
PEBBLO_API_KEY = os.environ.get("PEBBLO_API_KEY")
PEBBLO_CLOUD_URL = os.environ.get("PEBBLO_CLOUD_URL")
PG_CONNECTION_STRING = os.environ.get("PG_CONNECTION_STRING")

# Module-level logger, set to INFO.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


class PebbloSafeRAG:
    """
    Sample application showing PebbloSafeLoader wrapped around a
    PebbloTextLoader, with the loaded documents stored in a PostgreSQL
    vector store (PGVector).

    Constructing an instance performs the whole flow: it loads the sample
    texts, prints a summary of the metadata found on the loaded documents,
    and writes the documents to the vector store.
    """

    def __init__(self, collection_name: str):
        """
        Load the sample documents and hydrate the vector DB.

        Args:
            collection_name: Name of the PGVector collection the documents
                are written to.
        """
        self.loader_app_name = "pebblo-safe-loader-text-loader"
        self.collection_name = collection_name

        separator = "-" * 120
        print(separator)

        # Load documents
        print("Loading RAG documents ...")
        texts, metadata, metadatas, ids = get_data(
            metadata=True, ids=True, metadatas=True
        )
        text_loader = PebbloTextLoader(
            texts=texts,
            metadata=metadata,
            metadatas=metadatas,
            ids=ids,
        )
        self.loader = PebbloSafeLoader(
            text_loader,
            name=self.loader_app_name,  # App name (Mandatory)
            owner="Joe Smith",  # Owner (Optional)
            description="Identity & Semantic enabled SafeLoader app using Pebblo",  # Description (Optional)
            load_semantic=True,
            api_key=PEBBLO_API_KEY,
        )
        self.documents = self.loader.load()

        # Gather the distinct values of each metadata field of interest
        # across all loaded documents; missing or empty fields are skipped.
        distinct_values = {
            "authorized_identities": set(),
            "pebblo_semantic_topics": set(),
            "pebblo_semantic_entities": set(),
        }
        for document in self.documents:
            for field, bucket in distinct_values.items():
                values = document.metadata.get(field)
                if values:
                    bucket.update(values)

        identities = list(distinct_values["authorized_identities"])
        topics = list(distinct_values["pebblo_semantic_topics"])
        entities = list(distinct_values["pebblo_semantic_entities"])

        print(f"Loaded {len(self.documents)} documents with the following metadata:")
        print(f"Authorized Identities: {identities}")
        print(f"Semantic Topics: {topics}")
        print(f"Semantic Entities: {entities}")
        print(separator)

        # Load documents into VectorDB
        print("Hydrating Vector DB ...")
        self.vectordb = self.init_vector_db()
        print("Finished hydrating Vector DB ...\n")
        print(separator)

    def init_vector_db(self):
        """
        Build the PGVector store from ``self.documents``.

        Uses OpenAI embeddings, the collection named by
        ``self.collection_name`` and the connection string taken from
        ``PG_CONNECTION_STRING``.

        Returns:
            The vector store object returned by ``PGVector.from_documents``.
        """
        embedder = OpenAIEmbeddings()
        # NOTE(review): pre_delete_collection=True presumably drops any
        # existing collection of the same name on every run - confirm
        # against the langchain-postgres documentation.
        store = PGVector.from_documents(
            embedding=embedder,
            documents=self.documents,
            collection_name=self.collection_name,
            connection=PG_CONNECTION_STRING,
            pre_delete_collection=True,
            use_jsonb=True,
        )
        print(f"Added {len(self.documents)} documents to PostgreSQL ...\n")
        return store


if __name__ == "__main__":
    # Name of the PGVector collection that receives the sample documents.
    input_collection_name = "identity-enabled-text-loader"
    # Constructing the app runs the full flow: load documents, print the
    # metadata summary, and hydrate the vector DB.
    rag_app = PebbloSafeRAG(collection_name=input_collection_name)
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
python-dotenv==1.0.0
tiktoken # OpenAI tokenizer

langchain-openai>=0.1.7 # For OpenAI LLM and OpenAIEmbeddings
langchain-community>=0.2.16,<0.3 # for PebbloSafeLoader, PebbloRetrievalQA

psycopg2-binary # For Postgres VectorStore
langchain-postgres # For Postgres VectorStore
54 changes: 54 additions & 0 deletions pebblo_safeloader/langchain/textloader_postgress/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from hashlib import sha256
from typing import Any, Optional


def get_data(
    metadata: bool = False, ids: bool = False, metadatas: bool = False
) -> tuple[
    list[str],
    Optional[dict[str, Any]],
    Optional[list[dict[str, Any]]],
    Optional[list[str]],
]:
    """
    Get sample data for PebbloTextLoader.

    The texts are fixed sample strings; the last one deliberately contains
    PII-looking values (SSN, passport, card numbers, ...).

    Args:
        metadata: Include the metadata dict shared by all texts.
            Optional. Defaults to False.
        ids: Include a unique id for each text.
            Optional. Defaults to False.
        metadatas: Include a per-text metadata dict.
            Optional. Defaults to False.

    Returns:
        tuple: ``(texts, metadata, metadatas, ids)`` where

        - ``texts`` is always the list of sample strings;
        - ``metadata`` is ``{"authorized_identities": [...]}`` or ``None``
          when ``metadata`` is False;
        - ``metadatas`` is a list with one ``{"source": url}`` dict per
          text, or ``None`` when ``metadatas`` is False;
        - ``ids`` is a list with the SHA-256 hex digest of each text, or
          ``None`` when ``ids`` is False.
    """

    texts = [
        "Wipros board on Friday, January 12 announced an interim dividend of Re 1 per equity share of the face value of Rs 2 each, i.e., a 50 per cent payout for the current financial year along with financial results for the October-December period of the company for the financial year ending March 2024.",
        "Roberts reminded the board of the scheduled retreat coming up in three months, and provided a drafted retreat schedule. The board provided feedback on the agenda and the consensus was that, outside of making a few minor changes, the committee should move forward as planned. No board action required.",
        "Claims: An adaptive pacing system for implantable cardiac devices, comprising a pulse generator, multiple sensing electrodes, a microprocessor-based control unit, a wireless communication module, and memory for dynamically adjusting pacing parameters based on real-time physiological data. The system of claim 1, wherein the adaptive pacing algorithms include rate-responsive pacing based on physical activity. The system of claim 1, further comprising an external monitoring system for remote data access and modification of pacing parameters.",
        "Sachin's SSN is 222-85-4836. His passport ID is 5484880UA. Sachin's driver's license number is S9998888. Sachin's bank account number is 70048841700216300. His American express credit card number is 371449635398431. His UK IBAN Code is AZ96AZEJ00000000001234567890. ITIN number 993-77 0690.",
    ]

    # Metadata shared by every text.
    _metadata: Optional[dict[str, Any]] = (
        {"authorized_identities": ["joe.smith@acme.org"]} if metadata else None
    )

    # Per-text metadata (source: fake news web url), numbered from 1.
    _metadata_list: Optional[list[dict[str, Any]]] = (
        [
            {"source": f"https://www.acme.org/news/{position}"}
            for position, _ in enumerate(texts, start=1)
        ]
        if metadatas
        else None
    )

    # Unique ids for each text (sha256 hash of text).
    _ids: Optional[list[str]] = (
        [sha256(text.encode()).hexdigest() for text in texts] if ids else None
    )

    return texts, _metadata, _metadata_list, _ids