-
Notifications
You must be signed in to change notification settings - Fork 0
/
summarize.py
88 lines (73 loc) · 2.35 KB
/
summarize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain import OpenAI
from langchain.chains.summarize import load_summarize_chain
from time import time
import argparse
# Parse arguments (-h help is automatically generated)
parser = argparse.ArgumentParser()
parser.add_argument(
"-f",
"--file_path",
type=str,
help="Path to the PDF file to summarize",
required=True,
)
parser.add_argument(
"-c",
"--chain_type",
type=str,
help='Open ai summary chain_type: "stuff", "map_reduce" or "refine"',
default="map_reduce",
)
parser.add_argument(
"-s",
"--chunk_size",
type=int,
help="Size of the chunks to split the PDF into",
default=750,
)
parser.add_argument(
"-o",
"--chunk_overlap",
help="Amount of overlap between split chunks of PDF",
default=100,
)
args = parser.parse_args()
def summarize(pdf_path, chain_type, chunk_size, chunk_overlap) -> str:
"""Summarize a pdf using openai
Takes a
Required File path to a pdf,
Optional chain_type (strategy to call openai to generate summary)
Optional chunk_size (size of the chunks to split the pdf into)
Optional chunk_overlap (amount of overlap between chunks)
Returns a summary of the pdf from openai
"""
# Load the PDF from file_path
print(f"Loading PDF from path: {pdf_path}")
loader = PyPDFLoader(file_path=pdf_path)
# Split the PDF into chunks of text
text_splitter = CharacterTextSplitter(
chunk_size=chunk_size, chunk_overlap=chunk_overlap, separator="/n"
)
document = loader.load()
docs = text_splitter.split_documents(documents=document)
# Summarize throught openai, use the chain_type specified
llm = OpenAI(temperature=0)
chain = load_summarize_chain(llm, chain_type=chain_type)
openai_response = chain.run(docs)
# todo: store the summary in a file
return openai_response
if __name__ == "__main__":
start = (
time()
) # timing the method, because certain chain_types take a super long time
print(
summarize(
pdf_path=args.file_path,
chain_type=args.chain_type,
chunk_overlap=args.chunk_overlap,
chunk_size=args.chunk_size,
)
)
print(f"Time to summarize using chain_type = {args.chain_type}: {time() - start}")