Skip to content

Commit

Permalink
prompt edging
Browse files Browse the repository at this point in the history
  • Loading branch information
LIZARD-OFFICIAL-77 committed Oct 28, 2024
1 parent 0177aa3 commit c748909
Show file tree
Hide file tree
Showing 2 changed files with 39 additions and 64 deletions.
37 changes: 25 additions & 12 deletions spicejack/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@

from itertools import islice # used by grouper() to batch sentences into fixed-size chunks


MESSAGE_CONTEXT_SIZE = 10

def read_pdf(path):
rsrcmgr = PDFResourceManager()
retstr = StringIO()
Expand Down Expand Up @@ -160,13 +163,14 @@ def __init__(self,filepath,filters: list = None,use_legitimate=False,model="gpt-
self.fp = filepath
self.chatbot = G4FChatbot(model) if not use_legitimate else OpenAIChatbot(model)
self.chatbot.instructions(prompt1)
def run(self,*,thread=False,process=False,logging=False):
def run(self,*,thread=False,process=False,logging=False,autosave=False):
"""Process PDF file.
Args:
thread (bool, optional): Run in a child thread. Defaults to False.
process (bool, optional): Run in a child process. Defaults to False.
logging (bool, optional): Print the responses from the LLM. Defaults to False.
autosave (bool, optional): Save q&a pairs as soon as they are processed. Defaults to False.
"""

self.sent_list = split_into_sentences(read_pdf(self.fp))
Expand All @@ -179,27 +183,38 @@ def run(self,*,thread=False,process=False,logging=False):
remove_non_ascii
)


self.autosave = autosave
self.logging = logging
if thread:
self.thread = Thread(target=self.run)
self.thread = Thread(target=self.run,kwargs={
"logging":logging,
"autosave":autosave
})
self.thread.start()
return
if process:
self.process = Process(target=self.run)
self.process = Process(target=self.run,kwargs={
"logging":logging,
"autosave":autosave
})
self.process.start()
return

for sent in self.grouper(self.sent_list,10):
for sent in self.grouper(self.sent_list,MESSAGE_CONTEXT_SIZE):
try:
response = self.chatbot.message(" ".join(sent)).strip("```json").strip("```")
if self.logging:print(response)
response_json = json.loads(response)
if not response_json == {}:
self.result += response_json # convert response from AI to a python list.
if not response_json == []:
self.add(response_json) # convert response from AI to a python list.
if self.autosave:
self.save()

except json.JSONDecodeError:continue

return self.result
def add(self,pairs):
    """Append each parsed question/answer pair to the accumulated result.

    Args:
        pairs (list): q&a dicts decoded from one LLM JSON response.
    """
    # list.extend is the idiomatic, C-speed equivalent of the original
    # append-in-a-loop pattern; behavior is identical for any iterable.
    self.result.extend(pairs)
def stop(self):
if hasattr(self,"thread"):
self.thread.stop()
Expand All @@ -208,22 +223,20 @@ def stop(self):
else:
raise RuntimeError("No child process or child thread found.")

def save(self,jsonpath):
def save(self,jsonpath="result.json"):
"""Save the result into json file
Args:
jsonpath (str): Path to save the json file.
"""
with open(jsonpath,"w") as file:
json.dump(self.result,file)
json.dump(self.result,file,indent=4)
def grouper(self,iterable, size):
    """Yield consecutive chunks (lists) of at most *size* items from *iterable*.

    The final chunk may be shorter when the input length is not a multiple
    of *size*; an empty iterable yields nothing.
    """
    it = iter(iterable)
    while True:
        chunk = list(islice(it, size))
        if not chunk:
            break
        yield chunk

if __name__ == "__main__":
# Manual smoke test: process a sample PDF with LLM response logging enabled.
# NOTE(review): hardcoded developer-machine path — adjust before running.
processor = PDFprocessor("/home/lizard/Projects/SpiceJack/development/tests/Natural_language_processing.pdf")
processor.run(logging=True)


66 changes: 14 additions & 52 deletions spicejack/prompt.py
Original file line number Diff line number Diff line change
@@ -1,64 +1,26 @@
prompt1 = """You will receive a chunk of text extracted from a document. Based on this text, generate a list of relevant questions and answers in JSON format. The questions should be designed to extract key information from the text, and the answers should be concise yet complete.
prompt1 = """
You are a JSON question and answer generator. Your task is to create a JSON object containing questions and answers based solely on the provided text.
The output should follow this format:
```json
[
{"question": "question here", "answer": "answer here"},
{"question": "another question here", "answer": "another answer here"}
]
```
Please ensure:
1. That the questions cover the main points and important details.
2. That the answers are accurate and directly drawn from the provided content.
3. That you do not respond to chunks of text that do not make sense, or are too short; in that case, return `[]`.
The JSON returned must be fully independent from the document.
Example of a good, well-formed JSON list:
Output Format:
```json
[
{
"question": "What is the structure of the Bitcoin network?",
"answer": "The network requires minimal structure, with messages broadcast on a best effort basis and nodes capable of leaving and rejoining the network at will."
},
{
"question": "Describe the structure of the Bitcoin network.",
"answer": "The Bitcoin network requires minimal structure, with messages being broadcast on a best effort basis and nodes having the ability to leave and rejoin the network at will."
},
{
"question": "Who is the author of the Bitcoin paper?",
"answer": "Satoshi Nakamoto"
},
{
"question": "What is the main goal of Bitcoin according to the abstract?",
"answer": "A purely peer-to-peer version of electronic cash would allow online payments to be sent directly from one party to another without going through a financial institution."
[
{
"question": "What is [something about the subject] in [topic/subject]?",
"answer": "[Answer the question]"
},
{
"question": "What is [topic/subject]?",
"answer": "[Describe the topic/subject in a broad sense]"
},
]
}
]
```
Example of a bad JSON list (duplicated entries and answers that refer back to the text):
```json
[
{
"question": "What is the proposed solution to the double-spending problem described in the text?",
"answer": "The proposed solution is using a peer-to-peer network to timestamp transactions by hashing them into a chain of hash-based proof-of-work."
},
{
"question": "What is the proposed solution to the double-spending problem described in the text?",
"answer": "The proposed solution is using a peer-to-peer network to timestamp transactions by hashing them into a chain of hash-based proof-of-work."
},
{
"question": "What does the longest chain in the network serve as proof of?",
"answer": "The longest chain serves as proof of the sequence of events witnessed and the fact it came from the largest pool of CPU power."
},
{
"question": "Why would nodes accept the longest proof-of-work chain as proof of what happened?",
"answer": "As long as a majority of CPU power is controlled by nodes that are not cooperating to attack the network, they'll generate the longest chain and outpace attackers."
}]
```
Do not refer to THE TEXT. Make the questions and answers independent from it. Question and answers should not contain "the text".
"""
If there is any issue with processing the request, return `[]`."""

if __name__ == "__main__":
print(prompt1)

0 comments on commit c748909

Please sign in to comment.