-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add DocuSense, refactor, update tests and README.
- Loading branch information
Showing
7 changed files
with
236 additions
and
33 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
#!/usr/bin/env python3 | ||
""" | ||
This script summarizes a given document using OpenAI's GPT-3.5-turbo model.
It provides the 'init()' function to load environment variables, the 'summarize()' function to generate a summary and write it to a file,
and the 'docusense()' function as the command-line entry point. 'docusense()' takes command-line arguments
for the document path, the summary file path, the chunk size, and the chunk overlap, and uses prompts and chains to perform the summarization.
""" | ||
|
||
|
||
def init():
    """
    Load environment variables from a .env file into the process environment.

    The file is located with python-dotenv's ``find_dotenv()`` and loaded with
    ``override=True``.

    Returns:
        None
    """
    # Imported lazily so the dependency is only needed when the CLI actually runs.
    import dotenv

    env_path = dotenv.find_dotenv()
    dotenv.load_dotenv(env_path, override=True)
|
||
|
||
def summarize(
    document: str,
    summary_file: str,
    chunk_size: int,
    chunk_overlap: int,
    max_single_shot_num_tokens: int = 2048,
) -> None:
    """
    Summarize a document with OpenAI's GPT-3.5-turbo model and write the summary to a file.

    If the document's token count is at most ``max_single_shot_num_tokens``, the whole
    document is summarized in a single LLM call. Otherwise the document is split into
    chunks and summarized with a map-reduce chain.

    Args:
        document (str): The path to the document to be summarized.
        summary_file (str): The path of the file the summary is written to.
            The file is opened in write mode, so existing content is replaced.
        chunk_size (int): The size of each chunk of the document to be summarized.
            Only used for multi-shot (map-reduce) summarization.
        chunk_overlap (int): The amount of overlap between each chunk of the document.
            Only used for multi-shot (map-reduce) summarization.
        max_single_shot_num_tokens (int, optional): The maximum number of tokens allowed
            for a single-shot summarization. Defaults to 2048.

    Returns:
        None

    Raises:
        FileNotFoundError: If the specified document path does not exist
            (presumably raised by ``load_document`` -- confirm against that helper),
            or if ``summary_file`` cannot be created because its directory is missing.
    """
    # Heavy third-party imports are kept local so importing this module stays cheap.
    from langchain.chat_models import ChatOpenAI
    from langchain import PromptTemplate
    from langchain.chains import LLMChain
    from langchain.chains.summarize import load_summarize_chain
    from document_loaders.document_loaders import (
        load_document,
        merge_document,
        chunk_data,
    )
    from text_utils.text_utils import num_tokens_and_cost

    llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo")

    # Prompt applied to each individual chunk in the map step.
    map_prompt = """
    Write a concise summary of the following:
    Text: `{text}`
    CONCISE SUMMARY:
    """
    map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

    # Prompt that produces the final summary (also used for single-shot runs).
    combine_prompt = """
    Write a concise summary of the following text that covers key points.
    Add a title to the summary.
    Start the summary with an INTRODUCTION PARAGRAPH that gives an overview of the topic FOLLOWED
    by BULLET POINTS if possible AND end the summary with a CONCLUSION.
    Text: `{text}`
    """
    combine_prompt_template = PromptTemplate(
        template=combine_prompt, input_variables=["text"]
    )

    doc = load_document(document)
    num_tokens, cost = num_tokens_and_cost(doc)
    print(f"Approximate summarization cost: ${cost:.4f}")

    if num_tokens <= max_single_shot_num_tokens:
        # Small enough to send in one request: use the final-summary prompt directly.
        chain = LLMChain(llm=llm, prompt=combine_prompt_template)
        print("Running single-shot summarization")
        summary = chain.run({"text": merge_document(doc)})
    else:
        # Too large for one request: summarize chunks, then combine the partial summaries.
        # The chain is built once (the original constructed it twice and discarded the first).
        chain = load_summarize_chain(
            llm=llm,
            chain_type="map_reduce",
            map_prompt=map_prompt_template,
            combine_prompt=combine_prompt_template,
        )
        print("Running multi-shot summarization")
        summary = chain.run(
            chunk_data(data=doc, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
        )

    print(f"Writing summary to {summary_file}... ", end="")
    # Explicit UTF-8 so non-ASCII characters in the model output do not depend on
    # the platform's default encoding.
    with open(summary_file, "w", encoding="utf-8") as f:
        f.write(summary)
    print("Done")
|
||
|
||
def docusense() -> None:
    """
    Command-line entry point: parse arguments, load the environment, and summarize.

    Positional arguments are the document path and the summary file path.
    ``--chunk_size`` (default 3300) and ``--chunk_overlap`` (default 100) are optional.
    A ``FileNotFoundError`` raised during summarization is reported on stdout
    instead of propagating.

    Returns:
        None
    """
    import argparse

    cli = argparse.ArgumentParser(description="DocuSense")
    cli.add_argument(
        "document", type=str, help="Path to the document to be summarized."
    )
    cli.add_argument(
        "summary_file",
        type=str,
        help="Path to the file where summary will be written to.",
    )
    # Optional integer flags: (flag, default, help text).
    int_options = (
        ("--chunk_size", 3300, "Chunk size in tokens."),
        ("--chunk_overlap", 100, "Chunk overlap in tokens."),
    )
    for flag, default_value, help_text in int_options:
        cli.add_argument(flag, type=int, default=default_value, help=help_text)

    options = cli.parse_args()

    print(f"Instantiating DocuSense for {options.document}")
    # Environment (e.g. values from .env) must be loaded before summarization starts.
    init()
    try:
        summarize(
            options.document,
            options.summary_file,
            options.chunk_size,
            options.chunk_overlap,
        )
    except FileNotFoundError:
        print(f"File {options.document} not found")
|
||
|
||
# Run the command-line interface only when executed as a script, not when imported.
if __name__ == "__main__":
    docusense()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.