Skip to content

Commit

Permalink
prompt edging
Browse files Browse the repository at this point in the history
  • Loading branch information
LIZARD-OFFICIAL-77 committed Oct 28, 2024
1 parent 0177aa3 commit c748909
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 64 deletions.
37 changes: 25 additions & 12 deletions spicejack/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@

from itertools import islice # used by grouper() to batch sentences into fixed-size chunks


MESSAGE_CONTEXT_SIZE = 10

def read_pdf(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
Expand Down Expand Up @@ -160,13 +163,14 @@ def __init__(self,filepath,filters: list = None,use_legitimate=False,model="gpt-
self.fp = filepath
self.chatbot = G4FChatbot(model) if not use_legitimate else OpenAIChatbot(model)
self.chatbot.instructions(prompt1)
def run(self,*,thread=False,process=False,logging=False):
def run(self,*,thread=False,process=False,logging=False,autosave=False):
"""Process PDF file.
Args:
thread (bool, optional): Run in a child thread. Defaults to False.
process (bool, optional): Run in a child process. Defaults to False.
logging (bool, optional): Print the responses from the LLM. Defaults to False.
autosave (bool, optional): Save q&a pairs as soon as they are processed. Defaults to False.
"""

self.sent_list = split_into_sentences(read_pdf(self.fp))
Expand All @@ -179,27 +183,38 @@ def run(self,*,thread=False,process=False,logging=False):
remove_non_ascii
)


self.autosave = autosave
self.logging = logging
if thread:
self.thread = Thread(target=self.run)
self.thread = Thread(target=self.run,kwargs={
"logging":logging,
"autosave":autosave
})
self.thread.start()
return
if process:
self.process = Process(target=self.run)
self.process = Process(target=self.run,kwargs={
"logging":logging,
"autosave":autosave
})
self.process.start()
return

for sent in self.grouper(self.sent_list,10):
for sent in self.grouper(self.sent_list,MESSAGE_CONTEXT_SIZE):
try:
response = self.chatbot.message(" ".join(sent)).strip("```json").strip("```")
if self.logging:print(response)
response_json = json.loads(response)
if not response_json == {}:
self.result += response_json # convert response from AI to a python list.
if not response_json == []:
self.add(response_json) # convert response from AI to a python list.
if self.autosave:
self.save()

except json.JSONDecodeError:continue

return self.result
def add(self,pairs):
    """Append each parsed question/answer pair to the accumulated result.

    Args:
        pairs (list): q&a dicts decoded from one LLM JSON response.
    """
    # list.extend is the idiomatic, C-speed equivalent of the original
    # append-in-a-loop pattern; behavior is identical for any iterable.
    self.result.extend(pairs)
def stop(self):
if hasattr(self,"thread"):
self.thread.stop()
Expand All @@ -208,22 +223,20 @@ def stop(self):
else:
raise RuntimeError("No child process or child thread found.")

def save(self,jsonpath):
def save(self,jsonpath="result.json"):
"""Save the result into json file
Args:
jsonpath (str): Path to save the json file.
"""
with open(jsonpath,"w") as file:
json.dump(self.result,file)
json.dump(self.result,file,indent=4)
def grouper(self,iterable, size):
    """Yield consecutive chunks (lists) of at most *size* items from *iterable*.

    The final chunk may be shorter when the input length is not a multiple
    of *size*; an empty iterable yields nothing.
    """
    it = iter(iterable)
    while True:
        chunk = list(islice(it, size))
        if not chunk:
            break
        yield chunk

if __name__ == "__main__":
# Manual smoke test: process a sample PDF with LLM response logging enabled.
# NOTE(review): hardcoded developer-machine path — adjust before running.
processor = PDFprocessor("/home/lizard/Projects/SpiceJack/development/tests/Natural_language_processing.pdf")
processor.run(logging=True)


66 changes: 14 additions & 52 deletions spicejack/prompt.py
Original file line number Diff line number Diff line change
@@ -1,64 +1,26 @@
prompt1 = """You will receive a chunk of text extracted from a document. Based on this text, generate a list of relevant questions and answers in JSON format. The questions should be designed to extract key information from the text, and the answers should be concise yet complete.
prompt1 = """
You are a JSON question and answer generator. Your task is to create a JSON object containing questions and answers based solely on the provided text.
The output should follow this format:
```json
[
{"question": "question here", "answer": "answer here"},
{"question": "another question here", "answer": "another answer here"}
]
```
Please ensure:
1. That the questions cover the main points and important details.
2. That the answers are accurate and directly drawn from the provided content.
3. That you do not respond to chunks of text that do not make sense, or are too short; in that case, return `[]`.
The JSON returned must be fully independent from the document.
Example of a good, well-formed JSON list:
Output Format:
```json
[
{
"question": "What is the structure of the Bitcoin network?",
"answer": "The network requires minimal structure, with messages broadcast on a best effort basis and nodes capable of leaving and rejoining the network at will."
},
{
"question": "Describe the structure of the Bitcoin network.",
"answer": "The Bitcoin network requires minimal structure, with messages being broadcast on a best effort basis and nodes having the ability to leave and rejoin the network at will."
},
{
"question": "Who is the author of the Bitcoin paper?",
"answer": "Satoshi Nakamoto"
},
{
"question": "What is the main goal of Bitcoin according to the abstract?",
"answer": "A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution."
[
{
"question": "What is [something about the subject] in [topic/subject]?",
"answer": "[Answer the question]"
},
{
"question": "What is [topic/subject]?",
"answer": "[Describe the topic/subject in a broad sense]"
},
]
}
]
```
Example of a bad JSON list (duplicated entries and answers that refer back to the text):
```json
[
{
"question": "What is the proposed solution to the double-spending problem described in the text?",
"answer": "The proposed solution is using a peer-to-peer network to timestamp transactions by hashing them into a chain of hash-based proof-of-work."
},
{
"question": "What is the proposed solution to the double-spending problem described in the text?",
"answer": "The proposed solution is using a peer-to-peer network to timestamp transactions by hashing them into a chain of hash-based proof-of-work."
},
{
"question": "What does the longest chain in the network serve as proof of?",
"answer": "The longest chain serves as proof of the sequence of events witnessed and the fact it came from the largest pool of CPU power."
},
{
"question": "Why would nodes accept the longest proof-of-work chain as proof of what happened?",
"answer": "As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers."
}]
```
Do not refer to THE TEXT. Make the questions and answers independent from it. Question and answers should not contain "the text".
"""
If there is any issue with processing the request, return `[]`."""

if __name__ == "__main__":
print(prompt1)

0 comments on commit c748909

Please sign in to comment.