-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathembedScript.py
81 lines (71 loc) · 2.31 KB
/
embedScript.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
from dotenv import load_dotenv
import os
from pinecone import Pinecone, ServerlessSpec
from openai import OpenAI
import json
import time
# Load environment variables from the local dotenv file.
load_dotenv(dotenv_path='.env.local')

# Debugging aid: confirm that the keys were loaded WITHOUT echoing the secret
# values themselves, so they cannot leak into terminal scrollback, shell
# history captures, or CI logs.
for _key_name in ("PINECONE_API_KEY", "OPENAI_API_KEY"):
    print(f"{_key_name}: {'set' if os.getenv(_key_name) else 'MISSING'}")
# Connect to Pinecone using the API key taken from the environment.
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Name of the index this script writes to, and the vector size it is created
# with (presumably the output size of the embedding model used later in the
# script -- confirm if the model changes).
index_name = "ram-index"
dimension = 1536

# Provision the index only when the account does not already list it.
# NOTE(review): indentation was lost in the copy under review; the success
# message is assumed to belong to the creation branch.
existing_index_names = pc.list_indexes().names()
if index_name not in existing_index_names:
    pc.create_index(
        name=index_name,
        dimension=dimension,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print(f"Index '{index_name}' created successfully.")
# Load the review data
data_path = './data.json'
try:
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open(data_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
except (OSError, ValueError) as e:
    # OSError covers a missing or unreadable file; ValueError covers malformed
    # JSON (json.JSONDecodeError) and undecodable bytes (UnicodeDecodeError).
    print(f"Error reading or parsing data.json: {e}")
    # The bare `exit()` helper is injected by the `site` module for
    # interactive use and is not guaranteed to exist (e.g. `python -S`);
    # raising SystemExit terminates with the same status code everywhere.
    raise SystemExit(1)
# Vectors accumulated here are sent to Pinecone after the loop.
processed_data = []
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
)

# Review fields copied verbatim into each vector's metadata.
METADATA_FIELDS = (
    "review",
    "mechanicName",
    "shopName",
    "rating",
    "specialty",
    "location",
)

# Create embeddings for each review
for review in data:
    print(f"Processing review ID: {review['id']}")
    try:
        response = client.embeddings.create(
            input=review['review'],
            model="text-embedding-3-small"
        )
        embedding = response.data[0].embedding
        processed_data.append(
            {
                "values": embedding,
                # Pinecone record IDs must be strings; coerce so numeric IDs
                # in the input file do not make the later upsert fail.
                "id": str(review["id"]),
                "metadata": {
                    field: review[field] for field in METADATA_FIELDS
                },
            }
        )
    except Exception as e:
        # Best-effort per record: report the failure (API error or a review
        # missing one of the expected fields) and carry on with the rest.
        print(f"Error creating embedding for review {review['id']}: {e}")
    time.sleep(1)  # Add a 1-second delay between requests
# Insert the embeddings into the Pinecone index
index = pc.Index(index_name)

# Send the vectors in bounded batches rather than one request: Pinecone caps
# the size of a single upsert request, and high-dimensional vectors reach that
# cap quickly. 100 is a conservative batch size (see Pinecone's upsert limits).
# Batching also means an empty `processed_data` results in no API call at all.
UPSERT_BATCH_SIZE = 100
upserted_total = 0
for batch_start in range(0, len(processed_data), UPSERT_BATCH_SIZE):
    batch = processed_data[batch_start:batch_start + UPSERT_BATCH_SIZE]
    upsert_response = index.upsert(
        vectors=batch,
        namespace="ns1",
    )
    upserted_total += upsert_response['upserted_count']
print(f"Upserted count: {upserted_total}")

# Print index statistics
print(index.describe_index_stats())