From 7060df05a44c30e541eb09afd0d090d002214d85 Mon Sep 17 00:00:00 2001 From: kyle gaspar Date: Sat, 3 Aug 2024 21:53:31 -0400 Subject: [PATCH 1/9] added speech recognition --- .../terminal_interface/utils/cli_input.py | 85 ++++++++++++++++--- 1 file changed, 71 insertions(+), 14 deletions(-) diff --git a/interpreter/terminal_interface/utils/cli_input.py b/interpreter/terminal_interface/utils/cli_input.py index 891c1c8ab7..e7b44ad451 100644 --- a/interpreter/terminal_interface/utils/cli_input.py +++ b/interpreter/terminal_interface/utils/cli_input.py @@ -1,17 +1,74 @@ +import speech_recognition as sr +import time + +speech_mode = False + + +class SpeechRecognizer: + def __init__(self, wake_word="jarvis"): + self.r = sr.Recognizer() + self.mic = sr.Microphone() + self.wake_word = wake_word + + def listen_for_speech(self): + """Listens for speech and returns the transcribed text.""" + with self.mic as source: + print("Listening...") + # self.r.adjust_for_ambient_noise(source) + audio = self.r.listen(source) + + try: + text = self.r.recognize_google(audio) + text = text[text.find(" ") + 1 :] if " " in text else text # Get rid of activation word + print(f"You said: {text}") + return text + except sr.UnknownValueError: + print("Could not understand audio") + return "" + except sr.RequestError as e: + print( + f"Could not request results from Google Speech Recognition service; {e}" + ) + return "" + + def cli_input(prompt: str = "") -> str: + global speech_mode + start_marker = '"""' end_marker = '"""' - message = input(prompt) - - # Multi-line input mode - if start_marker in message: - lines = [message] - while True: - line = input() - lines.append(line) - if end_marker in line: - break - return "\n".join(lines) - - # Single-line input mode - return message + + while True: + if speech_mode: + recognizer = SpeechRecognizer() + text = recognizer.listen_for_speech() + if text == "exit": + print("Exiting speech recognition mode.") + speech_mode = False + elif text: + return 
text + else: + message = input(prompt) + # Speech recognition trigger + if message == ">": + speech_mode = True + continue # Go back to the beginning of the loop for speech input + + # Multi-line input mode + if start_marker in message: + lines = [message] + while True: + line = input() + lines.append(line) + if end_marker in line: + break + return "\n".join(lines) + + # Single-line input mode + return message + + +if __name__ == "__main__": + while True: + user_input = cli_input("Enter text or '>' for speech input: ") + print(f"You entered: {user_input}") From 90f331d16a3fa6b8ddeca4b78d90cfdcba40a699 Mon Sep 17 00:00:00 2001 From: kyle gaspar Date: Sun, 4 Aug 2024 11:51:55 -0400 Subject: [PATCH 2/9] add vox to y/n code execution --- .../terminal_interface/terminal_interface.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/interpreter/terminal_interface/terminal_interface.py b/interpreter/terminal_interface/terminal_interface.py index 9b9f6fd7e3..6ae1ce6b1d 100644 --- a/interpreter/terminal_interface/terminal_interface.py +++ b/interpreter/terminal_interface/terminal_interface.py @@ -213,9 +213,22 @@ def terminal_interface(interpreter, message): "Would you like to run this code? (y/n)\n\n" ) else: - response = input( - " Would you like to run this code? (y/n)\n\n " - ) + print(" Would you like to run this code? (yes/no)\n\n ") + # response = input( + # " 666Would you like to run this code? 
(y/n)\n\n " + # ) + response = cli_input().strip().lower() + + print(f"-->{response}<--") + + if response == 'yes': + response = 'y' + if response == 'edit': + response = 'e' + if response == 'no': + response = 'n' + + print("") # <- Aesthetic choice if response.strip().lower() == "y": From 579022cd71f6c6ac13eefdb4f0f287a8a670cefe Mon Sep 17 00:00:00 2001 From: kyle gaspar Date: Sun, 4 Aug 2024 11:52:48 -0400 Subject: [PATCH 3/9] catch speech library import error --- .../terminal_interface/utils/cli_input.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/interpreter/terminal_interface/utils/cli_input.py b/interpreter/terminal_interface/utils/cli_input.py index e7b44ad451..3fa2915843 100644 --- a/interpreter/terminal_interface/utils/cli_input.py +++ b/interpreter/terminal_interface/utils/cli_input.py @@ -1,19 +1,34 @@ -import speech_recognition as sr +#import speech_recognition as sr import time +import sys speech_mode = False class SpeechRecognizer: def __init__(self, wake_word="jarvis"): + global sr + self.import_library() self.r = sr.Recognizer() self.mic = sr.Microphone() self.wake_word = wake_word + def import_library(self): + """Check if the required libraries are installed.""" + global sr + try: + import speech_recognition as sr + except ImportError: + print("Please install the SpeechRecognition and pyaudio libraries.") + if sys.platform == "darwin": + print("On Mac, install portaudio with: brew install portaudio") + print("pip install SpeechRecognition pyaudio") + def listen_for_speech(self): """Listens for speech and returns the transcribed text.""" with self.mic as source: print("Listening...") + # This might be good. More testing needed. Seemed to work better without it. # self.r.adjust_for_ambient_noise(source) audio = self.r.listen(source) From cffcd9a725823a70b0fe032397ad5f3199f09c2c Mon Sep 17 00:00:00 2001 From: kyle gaspar Date: Mon, 5 Aug 2024 07:25:56 -0400 Subject: [PATCH 4/9] code/namespace cleanup. 
added documentation --- .../terminal_interface/terminal_interface.py | 27 ++++--- .../terminal_interface/utils/cli_input.py | 76 +++++++++++++------ 2 files changed, 67 insertions(+), 36 deletions(-) diff --git a/interpreter/terminal_interface/terminal_interface.py b/interpreter/terminal_interface/terminal_interface.py index 6ae1ce6b1d..866f68859b 100644 --- a/interpreter/terminal_interface/terminal_interface.py +++ b/interpreter/terminal_interface/terminal_interface.py @@ -213,21 +213,24 @@ def terminal_interface(interpreter, message): "Would you like to run this code? (y/n)\n\n" ) else: - print(" Would you like to run this code? (yes/no)\n\n ") + # print(" Would you like to run this code? (yes/no)\n\n ") # response = input( # " 666Would you like to run this code? (y/n)\n\n " # ) - response = cli_input().strip().lower() - - print(f"-->{response}<--") - - if response == 'yes': - response = 'y' - if response == 'edit': - response = 'e' - if response == 'no': - response = 'n' - + # edit isn't in original prompt, but there's a branch for it just below here. Wonder what it's for. + response = ( + cli_input( + " Would you like to run this code? (yes/no)\n\n " + ) + .strip() + .lower() + ) + if response == "yes": + response = "y" + if response == "edit": + response = "e" + if response == "no": + response = "n" print("") # <- Aesthetic choice diff --git a/interpreter/terminal_interface/utils/cli_input.py b/interpreter/terminal_interface/utils/cli_input.py index 3fa2915843..02b66e163b 100644 --- a/interpreter/terminal_interface/utils/cli_input.py +++ b/interpreter/terminal_interface/utils/cli_input.py @@ -1,72 +1,100 @@ -#import speech_recognition as sr -import time import sys -speech_mode = False +"""Return input from keyboard or speech recognition.""" class SpeechRecognizer: + """Handle speech recognition using google. 
You must enable the API in the google cloud console.""" + + """Join chroma-dev group when logged into gmail: https://groups.google.com/a/chromium.org/g/chromium-dev""" + """Create project. Go to APIs and services -> Library. Search for speech. Enable Speech API.""" + """Go to API manager -> Credentials and create an API key.""" + def __init__(self, wake_word="jarvis"): - global sr - self.import_library() - self.r = sr.Recognizer() - self.mic = sr.Microphone() + self.speech_mode = False + self.imported = False self.wake_word = wake_word - def import_library(self): - """Check if the required libraries are installed.""" - global sr + def speak(self, val=None) -> bool: + """Set speech mode. Called with no argument, return current value. Called with an argument, sets value.""" + if val == None: + return self.speech_mode + self.speech_mode = val + return self.speech_mode + + def import_library(self) -> bool: + """Check if the required libraries are installed, if not, load them and return loaded status.""" + if self.imported: + return True try: import speech_recognition as sr + + self.sr = sr + self.r = sr.Recognizer() + self.mic = sr.Microphone() + self.imported = True + return True except ImportError: - print("Please install the SpeechRecognition and pyaudio libraries.") + print( + "Please install the SpeechRecognition and pyaudio libraries by executing the following commands:" + ) if sys.platform == "darwin": - print("On Mac, install portaudio with: brew install portaudio") + print("brew install portaudio") print("pip install SpeechRecognition pyaudio") + return False - def listen_for_speech(self): + def listen_for_speech(self) -> str: """Listens for speech and returns the transcribed text.""" with self.mic as source: print("Listening...") - # This might be good. More testing needed. Seemed to work better without it. - # self.r.adjust_for_ambient_noise(source) + # This might be good. More testing needed. Might work better without it. 
+ self.r.adjust_for_ambient_noise(source) audio = self.r.listen(source) try: text = self.r.recognize_google(audio) - text = text[text.find(" ") + 1 :] if " " in text else text # Get rid of activation word + text = ( + text[text.find(" ") + 1 :] if " " in text else text + ) # Get rid of activation word print(f"You said: {text}") return text - except sr.UnknownValueError: + except self.sr.UnknownValueError: print("Could not understand audio") return "" - except sr.RequestError as e: + except self.sr.RequestError as e: print( f"Could not request results from Google Speech Recognition service; {e}" ) return "" -def cli_input(prompt: str = "") -> str: - global speech_mode +recognizer = SpeechRecognizer() + +def cli_input(prompt: str = "") -> str: + """Return user input from keyboard or speech.""" + global recognizer start_marker = '"""' end_marker = '"""' while True: - if speech_mode: - recognizer = SpeechRecognizer() + if recognizer.speak(): + print(prompt) text = recognizer.listen_for_speech() if text == "exit": print("Exiting speech recognition mode.") - speech_mode = False + recognizer.speak(False) elif text: return text else: message = input(prompt) # Speech recognition trigger if message == ">": - speech_mode = True + if recognizer.import_library(): + recognizer.speak(True) + continue + recognizer.import_library() + recognizer.speak(True) continue # Go back to the beginning of the loop for speech input # Multi-line input mode From 3b9b77eeae9eafb10bc62e24c7ebcb6a72818d6e Mon Sep 17 00:00:00 2001 From: kyle gaspar Date: Mon, 5 Aug 2024 09:01:28 -0400 Subject: [PATCH 5/9] code cleanup; push string cleaning up execution chain. 
--- interpreter/terminal_interface/terminal_interface.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/interpreter/terminal_interface/terminal_interface.py b/interpreter/terminal_interface/terminal_interface.py index 866f68859b..66b6c278f9 100644 --- a/interpreter/terminal_interface/terminal_interface.py +++ b/interpreter/terminal_interface/terminal_interface.py @@ -212,11 +212,8 @@ def terminal_interface(interpreter, message): response = input( "Would you like to run this code? (y/n)\n\n" ) + response = response.strip().lower() else: - # print(" Would you like to run this code? (yes/no)\n\n ") - # response = input( - # " 666Would you like to run this code? (y/n)\n\n " - # ) # edit isn't in original prompt, but there's a branch for it just below here. Wonder what it's for. response = ( cli_input( @@ -234,14 +231,14 @@ def terminal_interface(interpreter, message): print("") # <- Aesthetic choice - if response.strip().lower() == "y": + if response == "y": # Create a new, identical block where the code will actually be run # Conveniently, the chunk includes everything we need to do this: active_block = CodeBlock(interpreter) active_block.margin_top = False # <- Aesthetic choice active_block.language = language active_block.code = code - elif response.strip().lower() == "e": + elif response == "e": # Edit # Create a temporary file From b8a4eab58f63574c60fc34e4bb6a9ce02ce81448 Mon Sep 17 00:00:00 2001 From: kyle gaspar Date: Mon, 5 Aug 2024 10:30:55 -0400 Subject: [PATCH 6/9] Informational messages such as speech not understood will stay on one line instead of continuously scrolling the display --- .../terminal_interface/terminal_interface.py | 2 +- .../terminal_interface/utils/cli_input.py | 25 +++++++++++++------ 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/interpreter/terminal_interface/terminal_interface.py b/interpreter/terminal_interface/terminal_interface.py index 66b6c278f9..e336f7f672 100644 --- 
a/interpreter/terminal_interface/terminal_interface.py +++ b/interpreter/terminal_interface/terminal_interface.py @@ -217,7 +217,7 @@ def terminal_interface(interpreter, message): # edit isn't in original prompt, but there's a branch for it just below here. Wonder what it's for. response = ( cli_input( - " Would you like to run this code? (yes/no)\n\n " + " Would you like to run this code? (yes/no) " ) .strip() .lower() diff --git a/interpreter/terminal_interface/utils/cli_input.py b/interpreter/terminal_interface/utils/cli_input.py index 02b66e163b..00169365ee 100644 --- a/interpreter/terminal_interface/utils/cli_input.py +++ b/interpreter/terminal_interface/utils/cli_input.py @@ -1,4 +1,5 @@ import sys +import time """Return input from keyboard or speech recognition.""" @@ -40,13 +41,17 @@ def import_library(self) -> bool: ) if sys.platform == "darwin": print("brew install portaudio") + if sys.platform == "linux": + print("sudo apt install python3-pyaudio") + print("If that doesn't work, you may need to install portaudio19 from source:") + print("https://www.portaudio.com/ then ./configure && make && make install.") print("pip install SpeechRecognition pyaudio") return False - def listen_for_speech(self) -> str: + def listen(self) -> str: """Listens for speech and returns the transcribed text.""" with self.mic as source: - print("Listening...") + print("Listening...", end='', flush=True) # This might be good. More testing needed. Might work better without it. self.r.adjust_for_ambient_noise(source) audio = self.r.listen(source) @@ -56,14 +61,16 @@ def listen_for_speech(self) -> str: text = ( text[text.find(" ") + 1 :] if " " in text else text ) # Get rid of activation word - print(f"You said: {text}") + print(f"\rYou said: {text}") return text except self.sr.UnknownValueError: - print("Could not understand audio") + print("\rCould not understand audio." 
+ " " * 30 + "\r", end='', flush=True) + time.sleep(2) + print("\r" + " " * 30 + "\r", end='', flush=True) # Clear the line return "" except self.sr.RequestError as e: print( - f"Could not request results from Google Speech Recognition service; {e}" + f"\rCould not request results from Google Speech Recognition service; {e}" ) return "" @@ -79,13 +86,15 @@ def cli_input(prompt: str = "") -> str: while True: if recognizer.speak(): - print(prompt) - text = recognizer.listen_for_speech() + print(prompt, end='', flush=True) + text = recognizer.listen() if text == "exit": - print("Exiting speech recognition mode.") + print("\rExiting speech recognition mode.") recognizer.speak(False) elif text: return text + else: + print("\r" + " " * 30 + "\r", end='', flush=True) # Clear the line else: message = input(prompt) # Speech recognition trigger From 4f4204a1949def53ddce71d3c11b1585d58f9fa2 Mon Sep 17 00:00:00 2001 From: kyle gaspar Date: Mon, 5 Aug 2024 13:49:00 -0400 Subject: [PATCH 7/9] cleanup --- interpreter/terminal_interface/utils/cli_input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/interpreter/terminal_interface/utils/cli_input.py b/interpreter/terminal_interface/utils/cli_input.py index 00169365ee..ad186fbba2 100644 --- a/interpreter/terminal_interface/utils/cli_input.py +++ b/interpreter/terminal_interface/utils/cli_input.py @@ -61,7 +61,7 @@ def listen(self) -> str: text = ( text[text.find(" ") + 1 :] if " " in text else text ) # Get rid of activation word - print(f"\rYou said: {text}") + print(f"\rYou said: {text}" + " " * 30) return text except self.sr.UnknownValueError: print("\rCould not understand audio." 
+ " " * 30 + "\r", end='', flush=True) From dd650f339a07f87c9f34407d9dde6d18123ca1c0 Mon Sep 17 00:00:00 2001 From: kyle gaspar Date: Tue, 6 Aug 2024 08:44:37 -0400 Subject: [PATCH 8/9] got rid of wake word --- interpreter/terminal_interface/utils/cli_input.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/interpreter/terminal_interface/utils/cli_input.py b/interpreter/terminal_interface/utils/cli_input.py index ad186fbba2..49fad921ca 100644 --- a/interpreter/terminal_interface/utils/cli_input.py +++ b/interpreter/terminal_interface/utils/cli_input.py @@ -11,10 +11,9 @@ class SpeechRecognizer: """Create project. Go to APIs and services -> Library. Search for speech. Enable Speech API.""" """Go to API manager -> Credentials and create an API key.""" - def __init__(self, wake_word="jarvis"): + def __init__(self): self.speech_mode = False self.imported = False - self.wake_word = wake_word def speak(self, val=None) -> bool: """Set speech mode. Called with no argument, return current value. Called with an argument, sets value.""" @@ -58,9 +57,6 @@ def listen(self) -> str: try: text = self.r.recognize_google(audio) - text = ( - text[text.find(" ") + 1 :] if " " in text else text - ) # Get rid of activation word print(f"\rYou said: {text}" + " " * 30) return text except self.sr.UnknownValueError: From a0758fc4a9c6baada511a3dcad969b80b45cad88 Mon Sep 17 00:00:00 2001 From: kyle gaspar Date: Tue, 6 Aug 2024 09:17:44 -0400 Subject: [PATCH 9/9] added > to enable speech recognnition --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 37651c7133..6a39ecfaec 100644 --- a/README.md +++ b/README.md @@ -284,6 +284,7 @@ In the interactive mode, you can use the below commands to enhance your experien - `%undo`: Removes the previous user message and the AI's response from the message history. 
- `%tokens [prompt]`: (_Experimental_) Calculate the tokens that will be sent with the next prompt as context and estimate their cost. Optionally calculate the tokens and estimated cost of a `prompt` if one is provided. Relies on [LiteLLM's `cost_per_token()` method](https://docs.litellm.ai/docs/completion/token_usage#2-cost_per_token) for estimated costs. - `%help`: Show the help message. +- `>`: Start speech recognition mode using Google's speech recognition API. Saying `exit` returns to text mode. Requires an internet connection and enabling the [speech library](https://nerdvittles.com/creating-an-api-key-for-google-speech-recognition/). ### Configuration / Profiles