From e1459b4ee5524011043721db1090be42f66cb467 Mon Sep 17 00:00:00 2001
From: Rajendra Kadam
Date: Wed, 18 Sep 2024 11:57:24 +0530
Subject: [PATCH 1/2] Samples: text loader sample using PebbloTextLoader

---
Review notes (not part of the commit message):
* util.py: get_data() returns None for ids when ids=False, so the last
  element of the return annotation is now Optional[list[str]] and the
  docstring says which items may be None.
* .env.sample: the connection-string template had lost its placeholders
  ("postgresql://:@:/"); they are restored with presumed names, and the
  spaces around "=" are dropped to match the other entries.
* Line counts are unchanged by these edits; the blob hashes on the
  "index" lines were not recomputed.
* The whitespace change in PATCH 2/2 is assumed to be a re-indent from
  8 to 4 spaces; confirm against the original commit.

 .../textloader_postgress/.env.sample          | 11 +++
 .../textloader_postgress/pebblo_safeload.py   | 98 +++++++++++++++++++
 .../textloader_postgress/requirements.txt     |  8 ++
 .../langchain/textloader_postgress/util.py    | 54 ++++++++++
 4 files changed, 171 insertions(+)
 create mode 100644 pebblo_safeloader/langchain/textloader_postgress/.env.sample
 create mode 100644 pebblo_safeloader/langchain/textloader_postgress/pebblo_safeload.py
 create mode 100644 pebblo_safeloader/langchain/textloader_postgress/requirements.txt
 create mode 100644 pebblo_safeloader/langchain/textloader_postgress/util.py

diff --git a/pebblo_safeloader/langchain/textloader_postgress/.env.sample b/pebblo_safeloader/langchain/textloader_postgress/.env.sample
new file mode 100644
index 00000000..9cbefec8
--- /dev/null
+++ b/pebblo_safeloader/langchain/textloader_postgress/.env.sample
@@ -0,0 +1,11 @@
+# OpenAI credentials
+OPENAI_API_KEY=
+
+# Pebblo configuration
+PEBBLO_CLOUD_URL=
+PEBBLO_API_KEY=
+PEBBLO_CLASSIFIER_URL="http://localhost:8000/"
+
+# Postgres configuration
+PG_CONNECTION_STRING="postgresql://<user>:<password>@<host>:<port>/<database>"
+
diff --git a/pebblo_safeloader/langchain/textloader_postgress/pebblo_safeload.py b/pebblo_safeloader/langchain/textloader_postgress/pebblo_safeload.py
new file mode 100644
index 00000000..b2eea758
--- /dev/null
+++ b/pebblo_safeloader/langchain/textloader_postgress/pebblo_safeload.py
@@ -0,0 +1,98 @@
+import logging
+import os
+
+from dotenv import load_dotenv
+from langchain_community.document_loaders.pebblo import (
+    PebbloSafeLoader,
+    PebbloTextLoader,
+)
+from langchain_openai.embeddings import OpenAIEmbeddings
+from langchain_postgres import PGVector
+from util import get_data
+
+load_dotenv()
+
+PEBBLO_API_KEY = os.getenv("PEBBLO_API_KEY")
+PEBBLO_CLOUD_URL = os.getenv("PEBBLO_CLOUD_URL")
+PG_CONNECTION_STRING = os.getenv("PG_CONNECTION_STRING")
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+
+class PebbloSafeRAG:
+    """
+    Sample app to demonstrate the usage of PebbloSafeLoader
+    using PebbloTextLoader and PostgreSQL VectorDB
+    """
+
+    def __init__(self, collection_name: str):
+        self.loader_app_name = "pebblo-safe-loader-text-loader"
+        self.collection_name = collection_name
+
+        print(120 * "-")
+        # Load documents
+        print("Loading RAG documents ...")
+        texts, metadata, metadatas, ids = get_data(
+            metadata=True, ids=True, metadatas=True
+        )
+        self.loader = PebbloSafeLoader(
+            PebbloTextLoader(
+                texts=texts,
+                metadata=metadata,
+                metadatas=metadatas,
+                ids=ids,
+            ),
+            name=self.loader_app_name,  # App name (Mandatory)
+            owner="Joe Smith",  # Owner (Optional)
+            description="Identity & Semantic enabled SafeLoader app using Pebblo",  # Description (Optional)
+            load_semantic=True,
+            api_key=PEBBLO_API_KEY,
+        )
+        self.documents = self.loader.load()
+        unique_identities = set()
+        unique_topics = set()
+        unique_entities = set()
+
+        for doc in self.documents:
+            if doc.metadata.get("authorized_identities"):
+                unique_identities.update(doc.metadata.get("authorized_identities"))
+            if doc.metadata.get("pebblo_semantic_topics"):
+                unique_topics.update(doc.metadata.get("pebblo_semantic_topics"))
+            if doc.metadata.get("pebblo_semantic_entities"):
+                unique_entities.update(doc.metadata.get("pebblo_semantic_entities"))
+
+        print(f"Loaded {len(self.documents)} documents with the following metadata:")
+        print(f"Authorized Identities: {list(unique_identities)}")
+        print(f"Semantic Topics: {list(unique_topics)}")
+        print(f"Semantic Entities: {list(unique_entities)}")
+        print(120 * "-")
+
+        # Load documents into VectorDB
+        print("Hydrating Vector DB ...")
+        self.vectordb = self.init_vector_db()
+        print("Finished hydrating Vector DB ...\n")
+        print(120 * "-")
+
+    def init_vector_db(self):
+        """
+        Initialize PostgreSQL VectorDB from documents
+        """
+        embeddings = OpenAIEmbeddings()
+        vectordb = PGVector.from_documents(
+            embedding=embeddings,
+            documents=self.documents,
+            collection_name=self.collection_name,
+            connection=PG_CONNECTION_STRING,
+            pre_delete_collection=True,
+            use_jsonb=True,
+        )
+        print(f"Added {len(self.documents)} documents to PostgreSQL ...\n")
+        return vectordb
+
+
+if __name__ == "__main__":
+    input_collection_name = "identity-enabled-text-loader"
+    rag_app = PebbloSafeRAG(
+        collection_name=input_collection_name,
+    )
diff --git a/pebblo_safeloader/langchain/textloader_postgress/requirements.txt b/pebblo_safeloader/langchain/textloader_postgress/requirements.txt
new file mode 100644
index 00000000..c263668b
--- /dev/null
+++ b/pebblo_safeloader/langchain/textloader_postgress/requirements.txt
@@ -0,0 +1,8 @@
+python-dotenv==1.0.0
+tiktoken  # OpenAI tokenizer
+
+langchain-openai>=0.1.7  # For OpenAI LLM and OpenAIEmbeddings
+langchain-community>=0.2.16,<0.3  # for PebbloSafeLoader, PebbloRetrievalQA
+
+psycopg2-binary  # For Postgres VectorStore
+langchain-postgres  # For Postgres VectorStore
\ No newline at end of file
diff --git a/pebblo_safeloader/langchain/textloader_postgress/util.py b/pebblo_safeloader/langchain/textloader_postgress/util.py
new file mode 100644
index 00000000..ccc1d30e
--- /dev/null
+++ b/pebblo_safeloader/langchain/textloader_postgress/util.py
@@ -0,0 +1,54 @@
+from hashlib import sha256
+from typing import Any, Optional
+
+
+def get_data(
+        metadata=False, ids=False, metadatas=False
+) -> tuple[
+    list[str],
+    Optional[dict[str, Any]],
+    Optional[list[dict[str, Any]]],
+    Optional[list[str]],
+]:
+    """
+    Get data for PebbloTextLoader
+
+    Args:
+        metadata: Include metadata for all texts.
+            Optional. Defaults to False.
+        ids: Include unique ids for each text.
+            Optional. Defaults to False.
+        metadatas: Include metadata for each text.
+            Optional. Defaults to False.
+
+    Returns:
+        tuple: (texts, metadata, metadatas, ids); unrequested items are None.
+    """
+
+    texts = [
+        "Wipros board on Friday, January 12 announced an interim dividend of Re 1 per equity share of the face value of Rs 2 each, i.e., a 50 per cent payout for the current financial year along with financial results for the October-December period of the company for the financial year ending March 2024.",
+        "Roberts reminded the board of the scheduled retreat coming up in three months, and provided a drafted retreat schedule. The board provided feedback on the agenda and the consensus was that, outside of making a few minor changes, the committee should move forward as planned. No board action required.",
+        "Claims: An adaptive pacing system for implantable cardiac devices, comprising a pulse generator, multiple sensing electrodes, a microprocessor-based control unit, a wireless communication module, and memory for dynamically adjusting pacing parameters based on real-time physiological data. The system of claim 1, wherein the adaptive pacing algorithms include rate-responsive pacing based on physical activity. The system of claim 1, further comprising an external monitoring system for remote data access and modification of pacing parameters.",
+        "Sachin's SSN is 222-85-4836. His passport ID is 5484880UA. Sachin's driver's license number is S9998888. Sachin's bank account number is 70048841700216300. His American express credit card number is 371449635398431. His UK IBAN Code is AZ96AZEJ00000000001234567890. ITIN number 993-77 0690.",
+    ]
+
+    if metadata:
+        _metadata = {"authorized_identities": ["joe.smith@acme.org"]}
+    else:
+        _metadata = None
+
+    if metadatas:
+        # Metadata(source: fake news web url) for each text
+        _metadata_list = [
+            {"source": f"https://www.acme.org/news/{i}"}
+            for i in range(1, len(texts) + 1)
+        ]
+    else:
+        _metadata_list = None
+
+    if ids:
+        # Unique ids for each text (sha256 hash of text)
+        _ids = [sha256(text.encode()).hexdigest() for text in texts]
+    else:
+        _ids = None
+    return texts, _metadata, _metadata_list, _ids

From 291d09aec4667fc8b2f0ac21f1aed5025d79f87a Mon Sep 17 00:00:00 2001
From: Rajendra Kadam
Date: Wed, 18 Sep 2024 12:13:19 +0530
Subject: [PATCH 2/2] Fix lint

---
 pebblo_safeloader/langchain/textloader_postgress/util.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pebblo_safeloader/langchain/textloader_postgress/util.py b/pebblo_safeloader/langchain/textloader_postgress/util.py
index ccc1d30e..c082cd04 100644
--- a/pebblo_safeloader/langchain/textloader_postgress/util.py
+++ b/pebblo_safeloader/langchain/textloader_postgress/util.py
@@ -3,7 +3,7 @@
 
 
 def get_data(
-        metadata=False, ids=False, metadatas=False
+    metadata=False, ids=False, metadatas=False
 ) -> tuple[
     list[str],
     Optional[dict[str, Any]],