Bug fixes to default mac behavior, better audio quality on mac, better recollection tool use, more predictable completion detection assessments
Elias Weston-Farber authored and Elias Weston-Farber committed Dec 11, 2023
1 parent f6e692e commit 26c7a65
Showing 10 changed files with 102 additions and 55 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "jarvis_conversationalist"
version = "0.4.3"
version = "0.4.4"
authors = [{name="Elias Weston-Farber", email="eweston4@jhu.edu"}]
description = "A voice assistant for the command line"
readme = "README.md"
2 changes: 1 addition & 1 deletion src/jarvis_conversationalist/__init__.py
@@ -2,5 +2,5 @@
if sys.platform == 'linux':
__import__('pysqlite3')
sys.modules['sqlite3'] = sys.modules.pop('pysqlite3')
__version__ = '0.4.3'
__version__ = '0.4.4'
# Path: src/jarvis_conversationalist/conversationalist.py
4 changes: 3 additions & 1 deletion src/jarvis_conversationalist/__main__.py
@@ -52,6 +52,8 @@
if os.path.exists(lock_unknown_speakers):
os.remove(lock_unknown_speakers)

print("\033[KSummarizing previous conversations... Please Wait...\033[K", end='\r')

from .conversationalist import converse
import warnings

@@ -81,10 +83,10 @@ def main():
if args.no_speaker_detection:
config['speakers'] = False

save_config(config, key)
set_speakers_active(config.get('speakers', sys.platform != 'darwin'))
set_openai_key(config.get('key', None))
set_user(config.get('user', 'User'))
save_config(config, key)

if get_openai_key() is None:
print("Please set your OpenAI API key using the --key argument once to cache your key.")
18 changes: 9 additions & 9 deletions src/jarvis_conversationalist/audio_player.py
@@ -86,16 +86,16 @@ def _play_audio_file_blocking(file_path: str, stop_event: threading.Event, loops
for loop in range(loops):
if not stop_event.is_set() or (added_stop_event and not added_stop_event.is_set()):
data, fs = sf.read(file_path)
sd.play(data, fs)
while sd.get_stream().active:
if stop_event.is_set() or (added_stop_event and added_stop_event.is_set()):
sd.stop()
break
if added_stop_event:
added_stop_event.wait(timeout=.02)
else:
stop_event.wait(timeout=.02)
sd.play(data, fs, latency=.25)
while sd.get_stream().active:
if stop_event.is_set() or (added_stop_event and added_stop_event.is_set()):
sd.stop()
break
if added_stop_event:
added_stop_event.wait(timeout=.02)
else:
stop_event.wait(timeout=.02)
sd.stop()
# Destroy the file if needed
if destroy:
os.remove(file_path)
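Note on the change above: passing latency=.25 asks sounddevice/PortAudio for a higher suggested output latency, trading a quarter second of extra buffering for fewer underruns (audible clicks) on the mac output path. A minimal standalone sketch of the same pattern, assuming sounddevice and soundfile are installed and that "example.wav" is a placeholder path:

import threading

import sounddevice as sd
import soundfile as sf


def play_with_latency(path: str, stop_event: threading.Event, latency: float = 0.25) -> None:
    # Read the whole file, then hand it to PortAudio with an explicit output latency.
    data, fs = sf.read(path)
    sd.play(data, fs, latency=latency)  # extra keyword args are forwarded to the output stream
    while sd.get_stream().active:
        if stop_event.is_set():
            sd.stop()
            break
        stop_event.wait(timeout=0.02)  # poll roughly every 20 ms, like the player above
    sd.stop()


if __name__ == "__main__":
    play_with_latency("example.wav", threading.Event())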
2 changes: 1 addition & 1 deletion src/jarvis_conversationalist/config.py
@@ -13,7 +13,7 @@
USER = "User"
KEY = None
file = {}
if sys.platform == "darwin":
if sys.platform != "darwin":
SPEAKERS = True
else:
SPEAKERS = False
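This is the "default mac behavior" fix from the commit message: the old condition enabled speaker detection only on macOS, where it should be off by default, and the new condition enables it everywhere else. An equivalent one-line default, shown only as a sketch of the intent rather than the project's code:

import sys

# Speaker detection defaults to on everywhere except macOS.
SPEAKERS = sys.platform != "darwin"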
5 changes: 3 additions & 2 deletions src/jarvis_conversationalist/conversationalist.py
@@ -10,7 +10,8 @@
import importlib.resources as pkg_resources

from .openai_utility_functions import check_for_directed_at_me, check_for_completion, extract_query
from .openai_interface import stream_response, resolve_response, use_tools, schedule_refresh_assistant
from .openai_interface import stream_response, resolve_response, use_tools, schedule_refresh_assistant, \
get_speaker_detection
from .streaming_response_audio import stream_audio_response, set_rt_text_queue
from .audio_player import play_audio_file
from .audio_listener import audio_capture_process
@@ -201,7 +202,7 @@ def converse(memory, interrupt_event, start_event, stop_event):
threading.Event().wait(0.3)
speaking.set()
beeps_stop_event = play_audio_file(core_path+"/beeps.wav", loops=7, blocking=False)
extracted_query = extract_query(transcript)
extracted_query = extract_query(transcript, speaker_detection=get_speaker_detection())
logger.info("Query extracted: " + extracted_query)
new_history = None
if not interrupt_event.is_set():
102 changes: 65 additions & 37 deletions src/jarvis_conversationalist/openai_interface.py
@@ -52,7 +52,9 @@
tools_list = get_function_list() + get_speaker_function_list()
function_info = get_function_info()
speaker_info = get_speaker_function_info()
speaker_detection = False
for speaker_info_key, speaker_info_value in speaker_info.items():
speaker_detection = True
function_info[speaker_info_key] = speaker_info_value

# Setup background task system
@@ -61,6 +63,17 @@
atexit.register(executor.shutdown, wait=True)


def get_speaker_detection():
"""
Get whether speaker detection is enabled or not.
:return: Whether speaker detection is enabled or not.
:rtype: bool
"""
global speaker_detection
return speaker_detection


def summarizer(input_list):
"""
Summarize a conversation by sending a query to the OpenAI API.
@@ -93,39 +106,40 @@ def summarizer(input_list):
return {"role": "system", "content": output}


def recollect(question="", search_query="", mode=""):
def recollect(question="", query="", mode=""):
"""
Search the conversation history for a query.
:param question: The question to answer.
:type question: str
:param search_query: The query to search for.
:type search_query: str
:param query: The query input.
:type query: str
:param mode: The mode to search in.
Can be 'search_full', 'search_details', 'similarity_full', or 'similarity_details'.
Can be 'search_exact_text_full', 'search_exact_text_summaries', 'vector_similarity_full', or
'vector_similarity_summaries'.
:type mode: str
:return: The AI Assistant's response.
:rtype: str
"""
global models
description = ""
if mode == "search_full":
description = "search for the literal string '" + search_query + \
if mode == "search_exact_text_summaries":
description = "search for the literal string '" + query + \
"' in a collection of summaries of conversations"
results = history_access.summaries.get(where_document={"$contains": search_query},
results = history_access.summaries.get(where_document={"$contains": query},
include=["metadatas", "documents"])
if mode == "search_details":
description = "search for the literal string '" + search_query + "' in a collection of conversations"
results = history_access.history.get(where_document={"$contains": search_query},
if mode == "search_exact_text_full":
description = "search for the literal string '" + query + "' in a collection of conversations"
results = history_access.history.get(where_document={"$contains": query},
include=["metadatas", "documents"])
if mode == "similarity_full":
description = "search for the most similar string to '" + search_query + \
if mode == "vector_similarity_summaries":
description = "search for the most similar string to '" + query + \
"' in a collection of summaries of conversations"
results = history_access.summaries.query(query_texts=[search_query], n_results=20,
results = history_access.summaries.query(query_texts=[query], n_results=20,
include=["metadatas", "documents"])
if mode == "similarity_details":
description = "search for the most similar string to '" + search_query + "' in a collection of conversations"
results = history_access.history.query(query_texts=[search_query], n_results=20,
if mode == "vector_similarity_full":
description = "search for the most similar string to '" + query + "' in a collection of conversations"
results = history_access.history.query(query_texts=[query], n_results=20,
include=["metadatas", "documents"])
if mode == "schema":
schema = {"type": "function",
@@ -139,36 +153,38 @@ def recollect(question="", search_query="", mode=""):
"type": "string",
"description": "The question to answer.",
},
"search_query": {
"query": {
"type": "string",
"description": "The query to search for. If mode is 'search_full' or 'search_details' "
"this is the literal string to search for so keep it short or you "
"will get no results. If mode is 'similarity_full' "
"or 'similarity_details' this is the string to find the most similar "
"description": "The query to search for. If mode is 'search_exact_text_full' or "
"'search_exact_text_summaries' this is the literal string to search "
"for so keep it short or you will get no results. If mode is "
"'vector_similarity_full' or 'vector_similarity_summaries' this "
"is the string to find the most similar "
"string to so you can make it longer.",
},
"mode": {
"type": "string",
"description": "The mode to search in. Can be 'search_full', 'search_details', "
"'similarity_full', or 'similarity_details'. 'search_full' searches "
"description": "The mode to search in. Can be 'search_exact_text_full', "
"'search_exact_text_summaries', 'vector_similarity_full', or "
"'vector_similarity_summaries'. 'search_exact_text_full' searches "
"for the literal string in a collection of summaries of conversations. "
"'search_details' searches for the literal string in a collection of "
"conversations. 'similarity_full' searches for the most similar "
"string to the query in a collection of summaries of conversations. "
"'similarity_details' searches for the most similar string to the "
"query in a collection of conversations.",
"'search_exact_text_summaries' searches for the literal string in a "
"collection of conversations. 'vector_similarity_full' searches for "
"the most similar string to the query in a collection of summaries of "
"conversations. 'vector_similarity_summaries' searches for the most "
"similar string to the query in a collection of conversations.",
},
},
"required": ["question", "search_query", "mode"],
"required": ["question", "query", "mode"],
},
}
}
return schema
if mode == "examples":
examples = 'Examples:\n {"function_name": "recollect", "parameters": {"question": "What is the name of the' \
'user\'s dog?", "search_query": "dog", "mode": "search_full"}}\n {"function_name": "recollect", ' \
'parameters": {"question": "What is the town the user grew up in?", "search_query": "I was born in' \
' and grew up in ", "mode": "similarity_details"}}\n'
examples = 'Examples:\n{"function_name": "recollect", "parameters": {"question": "What is the name of the ' \
'user\'s dog?", "query": "dog", "mode": "search_exact_text_full"}}\n{"function_name": "recollect", ' \
'"parameters": {"question": "What is the town the user grew up in?", "query": "I was born in' \
' and grew up in ", "mode": "vector_similarity_summaries"}}\n'
return examples
if description == "":
raise Exception("Invalid mode")
@@ -178,16 +194,28 @@ def recollect(question="", search_query="", mode=""):

input_list = []
for i in range(len(results['ids'])):
input_list.append({"role": results["metadatas"][i]["role"], "content": results["documents"][i] + "\n" +
" took place on: " + convert_utc_to_local(results["metadatas"][i]["utc_time"])})
print(results)
if mode.startswith("vector_similarity"):
input_list.append({"role": results["metadatas"][0][i]["role"],
"content": results["documents"][0][i] +
"\n" + " took place on: " +
convert_utc_to_local(results["metadatas"][0][i]["utc_time"])})
else:
input_list.append({"role": results["metadatas"][i]["role"], "content": results["documents"][i] + "\n" +
" took place on: " + convert_utc_to_local(results["metadatas"][i]["utc_time"])})
input_list = history_access.truncate_input_context(input_list)

system_mem = [{"role": "system", "content": "You help an AI remember things by receiving a context based on a " +
description + "\n Please help it answer the following question:" +
"\n\n" + question}]
"\n\n" + question + "\n\nNOTE: If the current conversation " +
"context does not contain the answer to the question, " +
"make sure to tell the AI to modify either modify the query and if"
"the recollection process fails to the answer the question after"
"multiple query modifications, to consider the possibility that "
"what it is trying to remember 'is not in our memories'."}]
response = client.chat.completions.create(model=models["primary"]['name'],
messages=system_mem,
temperature=models["primary"]["temperature"],
temperature=.1,
max_tokens=models["primary"]["max_message"],
top_p=models["primary"]["top_p"],
frequency_penalty=models["primary"]["frequency_penalty"],
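The renamed recollect modes map onto two different Chroma lookup styles, and the new branch in the results loop reflects that Collection.query returns nested lists while Collection.get returns flat ones. A self-contained sketch of both styles, assuming chromadb with illustrative collection names and its default embedding function (downloaded on first use), not the project's actual history_access wrapper:

import chromadb

client = chromadb.Client()
summaries = client.get_or_create_collection("summaries")
history = client.get_or_create_collection("history")
history.add(ids=["h1"], documents=["User: my dog is named Waffles."])
summaries.add(ids=["s1"], documents=["The user talked about the town they grew up in."])

# search_exact_text_* modes: literal substring match via a where_document filter.
exact_hits = history.get(where_document={"$contains": "dog"}, include=["documents"])
# Collection.get returns flat lists: exact_hits["documents"][i] pairs with exact_hits["ids"][i].

# vector_similarity_* modes: embedding search via query_texts.
similar_hits = summaries.query(query_texts=["I was born in and grew up in"], n_results=1,
                               include=["documents"])
# Collection.query nests results per query text: similar_hits["documents"][0][i] is the i-th hit
# for the first query, which is why the new branch in recollect() indexes results["documents"][0][i].
print(exact_hits["documents"], similar_hits["documents"][0])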
17 changes: 16 additions & 1 deletion src/jarvis_conversationalist/openai_utility_functions.py
@@ -45,6 +45,7 @@ def check_for_directed_at_me(transcript, n=1):
" to " + name + " directly."

response = client.chat.completions.create(model="gpt-3.5-turbo",
temperature=0.1,
messages=[{"role": "system", "content": system_message},
{"role": "user", "content": "\n".join(transcript)}],
functions=functions,
@@ -98,6 +99,7 @@ def check_for_completion(transcript, n=1):
" if the user is done speaking by analyzing the text below and seeing if the user has completed their thought."

response = client.chat.completions.create(model="gpt-4",
temperature=0.1,
messages=[{"role": "system", "content": system_message},
{"role": "user", "content": "\n".join(transcript)}],
functions=functions,
@@ -111,7 +113,7 @@
return probabilities


def extract_query(transcript):
def extract_query(transcript, speaker_detection=True):
"""
Extracts the query from the user's speech.
@@ -156,6 +158,19 @@ def extract_query(transcript):
"sure to include the " \
"speaker annotation for " \
"each subsection."
if not speaker_detection:
system_message = "You are seeing a live transcription of what is being said in a room. It is your job to " \
"determine the query the user is asking by analyzing the text below and extracting word " \
"for word the section of the transcript that is the query. Ignore the rest of the unrelated " \
"transcript. Keep in mind that sometimes context from earlier parts of the conversation are " \
"critical to understanding a query - make sure to include all the context needed to complete" \
" the query well. There may be multiple people in the room or people on the phone. It's your" \
" job to determine which part of the transcript is the query. The query should be a question" \
" or a command or a statement directed at or highly related to " + \
name + ". It is ok if there are multiple parts of the query, or if multiple people appear " \
"to be asking questions to" + name + ", it is ok to include all of those subsections " \
"in the query - just make sure to include the " \
"speaker annotation for each subsection."

response = client.chat.completions.create(model="gpt-4",
messages=[{"role": "system", "content": system_message},
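Both utility calls above now pin temperature=0.1, which is what makes the completion-detection assessments more repeatable from run to run. A minimal, hypothetical sketch of that pattern with the openai 1.x client (the function name and schema here are illustrative only, and an OPENAI_API_KEY is assumed to be set in the environment):

from openai import OpenAI

client = OpenAI()

# Illustrative schema; the project defines its own function lists elsewhere.
functions = [{
    "name": "completion_assessment",
    "description": "Report whether the speaker appears to have finished their thought.",
    "parameters": {
        "type": "object",
        "properties": {
            "complete": {"type": "boolean", "description": "True if the thought is complete."},
        },
        "required": ["complete"],
    },
}]

response = client.chat.completions.create(
    model="gpt-4",
    temperature=0.1,  # the change in this commit: a low, fixed temperature for repeatable judgements
    messages=[
        {"role": "system", "content": "Decide whether the user has finished speaking."},
        {"role": "user", "content": "So what I was wondering is"},
    ],
    functions=functions,
    function_call={"name": "completion_assessment"},
)
print(response.choices[0].message.function_call.arguments)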
3 changes: 2 additions & 1 deletion src/jarvis_conversationalist/streaming_response_audio.py
@@ -106,7 +108,8 @@ def _play_audio(self, stop_other_audio: threading.Event = None,
self.playing = True

chunk_played = False
with sd.OutputStream(samplerate=sample_rate, channels=CHANNELS, dtype='int16') as stream:
with sd.OutputStream(samplerate=sample_rate, latency=.25,
channels=CHANNELS, dtype='int16') as stream:
for chunk in generator():
if skip and skip.is_set():
self.stop()
2 changes: 1 addition & 1 deletion src/jarvis_conversationalist/text_speech.py
@@ -207,7 +207,7 @@ def text_to_speech(text: str, model="gpt-4", stream=False):
first_word = fixed_text.split(" ")[0]
rest_of_text = fixed_text.replace(first_word, "")
fixed_text = "[[rate 175]] " + first_word + "[[rate 200]] " + rest_of_text
text_cmd = f'[[pbas {pitch}]] [[slnc 300]]{fixed_text}[[slnc 200]]'
text_cmd = f'[[pbas {pitch}]] [[slnc 100]]{fixed_text}[[slnc 100]]'
output_file = os.path.join(audio_folder, str(uuid.uuid4()) + ".wav")
result = subprocess.run(['say']+vflag+[text_cmd, "-o", output_file, '--data-format=LEI16@22050'],
capture_output=True)
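The shorter [[slnc 100]] markers trim the silence that macOS's say command inserts before and after each synthesized chunk, tightening the perceived response time. A macOS-only sketch of the same say invocation, with an illustrative pitch value and output path:

import os
import subprocess
import uuid

# [[pbas]] sets the pitch base, [[rate]] the speaking rate, [[slnc]] a silence in milliseconds.
text_cmd = "[[pbas 45]] [[slnc 100]][[rate 175]] Hello[[rate 200]] there, this is a test.[[slnc 100]]"
output_file = os.path.join("/tmp", str(uuid.uuid4()) + ".wav")
result = subprocess.run(["say", text_cmd, "-o", output_file, "--data-format=LEI16@22050"],
                        capture_output=True)
print(result.returncode, output_file)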
