-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessing.py
92 lines (77 loc) · 3.54 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import os
from pymongo import MongoClient
from langchain.document_loaders.json_loader import JSONLoader
from sentence_transformers import SentenceTransformer
import json
# -----------------------------
# Step 1: Locate the Financial Chatbot Dataset
# -----------------------------
# Path of the JSON dataset, relative to the current working directory.
dataset_file = './financial_dataset_chatbot.json'

# Fail fast when the dataset is missing; every later step depends on it.
if os.path.isfile(dataset_file):
    print(f"Dataset found at {dataset_file}")
else:
    raise FileNotFoundError(f"Dataset file not found at {dataset_file}. Please ensure the dataset exists.")
# -----------------------------
# Step 2: Load Documents
# -----------------------------
# Read the dataset through LangChain's JSONLoader. The jq schema "." selects
# the JSON root, and text_content=False permits non-string content.
print("Loading documents from dataset...")
loader = JSONLoader(
    file_path=dataset_file,
    jq_schema=".",
    text_content=False,
    json_lines=False,
)
docs = loader.load()
print(f"Loaded {len(docs)} documents successfully.")
# -----------------------------
# Step 3: Initialize Sentence Transformer Embedding Model
# -----------------------------
# Identifier of the embedding model passed to SentenceTransformer.
model_path = "ibm-granite/granite-embedding-125m-english"

# Instantiate the model once at module level; the Step 5 helper reuses it
# for every record it embeds.
print("Initializing Sentence Transformer Embedding Model...")
model = SentenceTransformer(model_path)
print("Embedding model initialized successfully.")
# -----------------------------
# Step 4: Initialize MongoDB Atlas Vector Store
# -----------------------------
# MongoDB connection string. Read from the MONGO_CONN environment variable so
# credentials do not have to be committed to source control; when the variable
# is unset the original placeholder is used, so existing setups that edit this
# line in place keep working.
MONGO_CONN = os.environ.get("MONGO_CONN", "<MONGODB-CONNECTION-STRING>")

# Establish a MongoDB connection
print("Connecting to MongoDB Atlas...")
client = MongoClient(
    MONGO_CONN,
    tls=True,
    # SECURITY NOTE(review): tlsAllowInvalidCertificates=True turns off server
    # certificate validation, which exposes the connection to
    # man-in-the-middle attacks. It is kept here to preserve current
    # behaviour; remove it (or set it to False) outside local experiments.
    tlsAllowInvalidCertificates=True,
)

# Collections of the "banking_quickstart" database, one per dataset section.
_banking_db = client["banking_quickstart"]
faq_collection = _banking_db["faqs"]
customer_collection = _banking_db["customers_details"]
transaction_collection = _banking_db["transactions_details"]
spending_insight_collection = _banking_db["spending_insight_details"]
# -----------------------------
# Step 5: Load Documents into Vector Store
# -----------------------------
# Define a helper function to process and store documents
def add_documents_to_vector_store(collection, data, key_field, *, content_field="customer_id"):
    """Embed one field of every record and insert the result into ``collection``.

    For each item in ``data`` the value at ``item[key_field]`` is encoded with
    the module-level ``model`` and one document is inserted with three fields:
    ``content_field`` (the extracted value), ``"embedding"`` (the vector
    converted to a list) and ``"metadata"`` (every other key of the item).

    Processing is best-effort: an item that raises while being read, encoded
    or inserted is reported on stdout and skipped, and the loop continues with
    the next item.

    Args:
        collection: Target collection; only ``insert_one`` is called on it.
        data: Iterable of dict-like records.
        key_field: Key whose value is embedded and stored.
        content_field: Name of the stored field that receives the extracted
            value. Defaults to ``"customer_id"``, the name this helper has
            always written, so existing callers and documents are unaffected.
            Callers that embed something other than a customer id (for
            example FAQ questions) can pass a more accurate name.

    Returns:
        The number of documents that were inserted successfully.
    """
    inserted = 0
    for item in data:
        try:
            content = item[key_field]  # Value that is embedded and stored
            metadata = {k: v for k, v in item.items() if k != key_field}
            # .tolist() turns the encoder output into a plain list for storage.
            embedding = model.encode(content).tolist()
            collection.insert_one({
                content_field: content,
                "embedding": embedding,
                "metadata": metadata,
            })
        except Exception as e:
            # Deliberate per-item boundary: one bad record must not abort the
            # whole load, so report it and move on.
            print(f"Error processing item: {item} - Error: {e}")
        else:
            inserted += 1
    return inserted
# Load each section of the dataset into its respective MongoDB collection.
# The file is opened with an explicit UTF-8 encoding so that parsing does not
# depend on the platform's default locale encoding.
with open(dataset_file, "r", encoding="utf-8") as file:
    dataset = json.load(file)

# FAQs are embedded by their question text; the other sections are embedded
# by their customer_id value.
print("Loading FAQs into vector store...")
add_documents_to_vector_store(faq_collection, dataset["faqs"], "question")

print("Loading Customers into vector store...")
add_documents_to_vector_store(customer_collection, dataset["customers"], "customer_id")

print("Loading Transactions into vector store...")
add_documents_to_vector_store(transaction_collection, dataset["transactions"], "customer_id")

print("Loading Spending Insights into vector store...")
add_documents_to_vector_store(spending_insight_collection, dataset["spending_insights"], "customer_id")

# NOTE(review): this message is printed unconditionally; items that failed
# inside add_documents_to_vector_store were only reported on stdout.
print("All documents successfully added to MongoDB Atlas with embeddings.")