diff --git a/README.md b/README.md
index 37651c7133..6a39ecfaec 100644
--- a/README.md
+++ b/README.md
@@ -284,6 +284,7 @@ In the interactive mode, you can use the below commands to enhance your experien
 - `%undo`: Removes the previous user message and the AI's response from the message history.
 - `%tokens [prompt]`: (_Experimental_) Calculate the tokens that will be sent with the next prompt as context and estimate their cost. Optionally calculate the tokens and estimated cost of a `prompt` if one is provided. Relies on [LiteLLM's `cost_per_token()` method](https://docs.litellm.ai/docs/completion/token_usage#2-cost_per_token) for estimated costs.
 - `%help`: Show the help message.
+- `>`: Start speech recognition mode using Google's speech recognition API. Saying `exit` returns to text mode. Requires an internet connection and enabling the [speech library](https://nerdvittles.com/creating-an-api-key-for-google-speech-recognition/).
 
 ### Configuration / Profiles
 
diff --git a/interpreter/terminal_interface/terminal_interface.py b/interpreter/terminal_interface/terminal_interface.py
index 9b9f6fd7e3..e336f7f672 100644
--- a/interpreter/terminal_interface/terminal_interface.py
+++ b/interpreter/terminal_interface/terminal_interface.py
@@ -212,20 +212,33 @@ def terminal_interface(interpreter, message):
                             response = input(
                                 "Would you like to run this code? (y/n)\n\n"
                             )
+                            response = response.strip().lower()
                         else:
-                            response = input(
-                                " Would you like to run this code? (y/n)\n\n "
+                            # edit isn't in original prompt, but there's a branch for it just below here. Wonder what it's for.
+                            response = (
+                                cli_input(
+                                    " Would you like to run this code? (yes/no) "
+                                )
+                                .strip()
+                                .lower()
                             )
+                        if response == "yes":
+                            response = "y"
+                        if response == "edit":
+                            response = "e"
+                        if response == "no":
+                            response = "n"
+
                         print("")  # <- Aesthetic choice
 
-                        if response.strip().lower() == "y":
+                        if response == "y":
                             # Create a new, identical block where the code will actually be run
                             # Conveniently, the chunk includes everything we need to do this:
                             active_block = CodeBlock(interpreter)
                             active_block.margin_top = False  # <- Aesthetic choice
                             active_block.language = language
                             active_block.code = code
-                        elif response.strip().lower() == "e":
+                        elif response == "e":
                             # Edit
 
                             # Create a temporary file
diff --git a/interpreter/terminal_interface/utils/cli_input.py b/interpreter/terminal_interface/utils/cli_input.py
index 891c1c8ab7..49fad921ca 100644
--- a/interpreter/terminal_interface/utils/cli_input.py
+++ b/interpreter/terminal_interface/utils/cli_input.py
@@ -1,17 +1,139 @@
+"""Return input from keyboard or speech recognition."""
+
+import sys
+import time
+
+
+class SpeechRecognizer:
+    """Handle speech recognition using Google.
+
+    You must enable the API in the Google Cloud console:
+
+    1. Join the chromium-dev group when logged into Gmail:
+       https://groups.google.com/a/chromium.org/g/chromium-dev
+    2. Create a project. Go to APIs and services -> Library. Search for
+       speech. Enable the Speech API.
+    3. Go to API manager -> Credentials and create an API key.
+    """
+
+    def __init__(self):
+        self.speech_mode = False  # True while input is taken from the microphone
+        self.imported = False  # True once import_library() has succeeded
+
+    def speak(self, val=None) -> bool:
+        """Get or set speech mode.
+
+        Called with no argument, return the current value. Called with an
+        argument, set the value and return it.
+        """
+        if val is None:
+            return self.speech_mode
+        self.speech_mode = val
+        return self.speech_mode
+
+    def import_library(self) -> bool:
+        """Load the speech libraries on first use and return loaded status.
+
+        Prints installation hints and returns False when they are missing.
+        """
+        if self.imported:
+            return True
+        try:
+            import speech_recognition as sr
+
+            self.sr = sr
+            self.r = sr.Recognizer()
+            self.mic = sr.Microphone()
+            self.imported = True
+            return True
+        except (ImportError, AttributeError):
+            # ImportError: SpeechRecognition is not installed.
+            # AttributeError: sr.Microphone() raises it when PyAudio is missing.
+            print(
+                "Please install the SpeechRecognition and pyaudio libraries by executing the following commands:"
+            )
+            if sys.platform == "darwin":
+                print("brew install portaudio")
+            if sys.platform == "linux":
+                print("sudo apt install python3-pyaudio")
+                print("If that doesn't work, you may need to install portaudio19 from source:")
+                print("https://www.portaudio.com/ then ./configure && make && make install.")
+            print("pip install SpeechRecognition pyaudio")
+            return False
+
+    def listen(self) -> str:
+        """Listens for speech and returns the transcribed text."""
+        with self.mic as source:
+            print("Listening...", end='', flush=True)
+            # This might be good. More testing needed. Might work better without it.
+            self.r.adjust_for_ambient_noise(source)
+            audio = self.r.listen(source)
+
+        try:
+            text = self.r.recognize_google(audio)
+            print(f"\rYou said: {text}" + " " * 30)
+            return text
+        except self.sr.UnknownValueError:
+            print("\rCould not understand audio." + " " * 30 + "\r", end='', flush=True)
+            time.sleep(2)
+            print("\r" + " " * 30 + "\r", end='', flush=True)  # Clear the line
+            return ""
+        except self.sr.RequestError as e:
+            print(
+                f"\rCould not request results from Google Speech Recognition service; {e}"
+            )
+            return ""
+
+
+recognizer = SpeechRecognizer()
+
+
 def cli_input(prompt: str = "") -> str:
+    """Return user input from keyboard or speech.
+
+    Entering `>` switches to speech mode; saying "exit" switches back to the
+    keyboard. Keyboard input containing a triple-quote marker starts a
+    multi-line message that ends at the next line containing the marker.
+    """
     start_marker = '"""'
     end_marker = '"""'
-    message = input(prompt)
-
-    # Multi-line input mode
-    if start_marker in message:
-        lines = [message]
-        while True:
-            line = input()
-            lines.append(line)
-            if end_marker in line:
-                break
-        return "\n".join(lines)
-
-    # Single-line input mode
-    return message
+
+    while True:
+        if recognizer.speak():
+            print(prompt, end='', flush=True)
+            text = recognizer.listen()
+            if text.strip().lower() == "exit":
+                print("\rExiting speech recognition mode.")
+                recognizer.speak(False)
+            elif text:
+                return text
+            else:
+                print("\r" + " " * 30 + "\r", end='', flush=True)  # Clear the line
+        else:
+            message = input(prompt)
+            # Speech recognition trigger
+            if message == ">":
+                # Enter speech mode only if the libraries loaded; otherwise
+                # listen() would fail because no microphone was created.
+                if recognizer.import_library():
+                    recognizer.speak(True)
+                continue  # Go back to the beginning of the loop
+
+            # Multi-line input mode
+            if start_marker in message:
+                lines = [message]
+                while True:
+                    line = input()
+                    lines.append(line)
+                    if end_marker in line:
+                        break
+                return "\n".join(lines)
+
+            # Single-line input mode
+            return message
+
+
+if __name__ == "__main__":
+    while True:
+        user_input = cli_input("Enter text or '>' for speech input: ")
+        print(f"You entered: {user_input}")