-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathingest_pdf_pinecone.py
59 lines (50 loc) · 1.79 KB
/
ingest_pdf_pinecone.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
"""Load a PDF file, split it into chunks, and ingest the chunks into a Pinecone index."""
import pickle
from langchain.document_loaders import PyMuPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from dotenv import load_dotenv
from pathlib import Path
import sys
import os
import argparse
import pinecone
# Locate the directory holding this script so the .env file is found no matter
# what the current working directory is. When ``sys.frozen`` is set (the
# program runs as a bundled executable) the executable's directory is used
# instead of ``__file__``.
if getattr(sys, 'frozen', False):
    script_location = Path(sys.executable).parent.resolve()
else:
    script_location = Path(__file__).parent.resolve()

# Load environment variables (Pinecone credentials below) from the adjacent
# .env file.
load_dotenv(dotenv_path=script_location / '.env')

parser = argparse.ArgumentParser(description='Ingest data.')
# Both options are mandatory: without them ``None`` would be handed to the PDF
# loader / index lookup and fail later with an unhelpful error, so let
# argparse reject the invocation with a clear usage message instead.
parser.add_argument('-f', '--fileName', required=True,
                    help="Data file's name, path")
parser.add_argument('-i', '--index_name', required=True,
                    help="Index's name")
args = parser.parse_args()

# NOTE: the misspelled global ``fileNmae`` is kept on purpose because
# ingest_docs() reads it by this exact name.
fileNmae = args.fileName
index_name = args.index_name

PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENV = os.getenv("PINECONE_ENV")

# initialize pinecone
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_ENV  # next to api key in console
)
def ingest_docs():
    """Load the PDF, split it into chunks and add them to a Pinecone index.

    The PDF path and the index name are read from the module-level globals
    ``fileNmae`` and ``index_name`` (populated from the command line). The
    index is opened with ``Pinecone.from_existing_index``, so it has to exist
    already; the chunks are appended to it.

    Returns:
        None. The function is run for its side effect of writing to Pinecone.
    """
    loader = PyMuPDFLoader(fileNmae)
    raw_documents = loader.load()

    # Split into chunks of at most 1000 characters, with 200 characters of
    # overlap between neighbouring chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
    )
    documents = text_splitter.split_documents(raw_documents)

    # An embedding model is required here; "gpt-4" is a chat-completion model
    # and cannot be used to produce embeddings.
    # NOTE(review): the target index dimension must match this model's output
    # size (1536 for text-embedding-ada-002) -- confirm against the index.
    embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

    db = Pinecone.from_existing_index(index_name, embeddings)
    db.add_documents(documents=documents)


if __name__ == "__main__":
    # Run the ingestion only when executed as a script.
    ingest_docs()