OpenInterpreter · CodeAKrome · Aug 4, 2024 · Aug 4, 2024 · Aug 4, 2024 · Aug 5, 2024
diff --git a/README.md b/README.md
@@ -284,6 +284,7 @@ In the interactive mode, you can use the below commands to enhance your experien
 - `%undo`: Removes the previous user message and the AI's response from the message history.
 - `%tokens [prompt]`: (_Experimental_) Calculate the tokens that will be sent with the next prompt as context and estimate their cost. Optionally calculate the tokens and estimated cost of a `prompt` if one is provided. Relies on [LiteLLM's `cost_per_token()` method](https://docs.litellm.ai/docs/completion/token_usage#2-cost_per_token) for estimated costs.
 - `%help`: Show the help message.
+- `>`: Start speech recognition mode using Google's speech recognition API. Say `exit` to return to text mode. Requires an internet connection and enabling the Google Speech API (see [creating an API key](https://nerdvittles.com/creating-an-api-key-for-google-speech-recognition/)).
 
 ### Configuration / Profiles
 

diff --git a/interpreter/terminal_interface/terminal_interface.py b/interpreter/terminal_interface/terminal_interface.py
@@ -212,20 +212,33 @@ def terminal_interface(interpreter, message):
                             response = input(
                                 "Would you like to run this code? (y/n)\n\n"
                             )
+                            response = response.strip().lower()
                         else:
-                            response = input(
-                                "  Would you like to run this code? (y/n)\n\n  "
+                            # edit isn't in original prompt, but there's a branch for it just below here. Wonder what it's for.
+                            response = (
+                                cli_input(
+                                    "  Would you like to run this code? (yes/no)  "
+                                )
+                                .strip()
+                                .lower()
                             )
+                        if response == "yes":
+                            response = "y"
+                        if response == "edit":
+                            response = "e"
+                        if response == "no":
+                            response = "n"
+
                         print("")  # <- Aesthetic choice
 
-                        if response.strip().lower() == "y":
+                        if response == "y":
                             # Create a new, identical block where the code will actually be run
                             # Conveniently, the chunk includes everything we need to do this:
                             active_block = CodeBlock(interpreter)
                             active_block.margin_top = False  # <- Aesthetic choice
                             active_block.language = language
                             active_block.code = code
-                        elif response.strip().lower() == "e":
+                        elif response == "e":
                             # Edit
 
                             # Create a temporary file

diff --git a/interpreter/terminal_interface/utils/cli_input.py b/interpreter/terminal_interface/utils/cli_input.py
@@ -1,17 +1,122 @@
+"""Return input from keyboard or speech recognition."""
+
+import sys
+import time
+
+
+class SpeechRecognizer:
+    """Speech-to-text helper backed by Google's speech recognition service (via ``speech_recognition``).
+
+    Setup: while logged into Gmail, join chromium-dev (https://groups.google.com/a/chromium.org/g/chromium-dev),
+    create a project, enable the Speech API under "APIs and services -> Library", then create an API key
+    under "API manager -> Credentials". Nothing is imported until ``import_library()`` is called."""
+
+    def __init__(self):
+        self.speech_mode = False  # True while input should come from the microphone
+        self.imported = False  # True once speech_recognition and the microphone are ready
+
+    def speak(self, val=None) -> bool:
+        """Return the speech-mode flag; when ``val`` is given, store it as the new flag first."""
+        if val is None:
+            return self.speech_mode
+        self.speech_mode = val
+        return self.speech_mode
+
+    def import_library(self) -> bool:
+        """Lazily import ``speech_recognition`` and open the microphone; print install hints and return False on failure."""
+        if self.imported:
+            return True
+        try:
+            import speech_recognition as sr
+
+            self.sr = sr
+            self.r = sr.Recognizer()
+            self.mic = sr.Microphone()
+            self.imported = True
+            return True
+        except (ImportError, AttributeError, OSError):  # Microphone(): AttributeError if PyAudio is missing, OSError if no input device
+            print(
+                "Please install the SpeechRecognition and pyaudio libraries by executing the following commands:"
+            )
+            if sys.platform == "darwin":
+                print("brew install portaudio")
+            if sys.platform == "linux":
+                print("sudo apt install python3-pyaudio")
+                print("If that doesn't work, you may need to install portaudio19 from source:")
+                print("https://www.portaudio.com/ then ./configure && make && make install.")
+            print("pip install SpeechRecognition pyaudio")
+            return False
+
+    def listen(self) -> str:
+        """Record one phrase from the microphone and return its transcription, or "" when recognition fails."""
+        with self.mic as source:
+            print("Listening...", end='', flush=True)
+            # Calibrates the recognizer to background noise before recording; whether this helps is still being evaluated.
+            self.r.adjust_for_ambient_noise(source)
+            audio = self.r.listen(source)
+
+        try:
+            text = self.r.recognize_google(audio)
+            print(f"\rYou said: {text}" + " " * 30)
+            return text
+        except self.sr.UnknownValueError:
+            print("\rCould not understand audio." + " " * 30 + "\r", end='', flush=True)
+            time.sleep(2)
+            print("\r" + " " * 30 + "\r", end='', flush=True)  # Clear the line
+            return ""
+        except self.sr.RequestError as e:
+            print(
+                f"\rCould not request results from Google Speech Recognition service; {e}"
+            )
+            return ""
+
+
+recognizer = SpeechRecognizer()  # shared module-level instance: cli_input() reads and toggles its speech-mode flag across calls
+
+
 def cli_input(prompt: str = "") -> str:
+    """Return one user message, read from the keyboard or (after the user types ``>``) from speech;
+    saying "exit" returns to keyboard mode, and input containing a triple-quote marker is read as multi-line."""
     start_marker = '"""'
     end_marker = '"""'
-    message = input(prompt)
-
-    # Multi-line input mode
-    if start_marker in message:
-        lines = [message]
-        while True:
-            line = input()
-            lines.append(line)
-            if end_marker in line:
-                break
-        return "\n".join(lines)
-
-    # Single-line input mode
-    return message
+
+    while True:
+        if recognizer.speak():
+            print(prompt, end='', flush=True)
+            text = recognizer.listen()
+            if text.strip().lower() == "exit":  # the recognizer may capitalize or pad the word
+                print("\rExiting speech recognition mode.")
+                recognizer.speak(False)
+            elif text:
+                return text
+            else:
+                print("\r" + " " * 30 + "\r", end='', flush=True)  # Clear the line
+        else:
+            message = input(prompt)
+            # Speech recognition trigger
+            if message == ">":
+                # Enter speech mode only when the library and microphone loaded. On failure
+                # import_library() has already printed install instructions, so stay in text
+                # mode and re-prompt rather than calling listen() on an uninitialised recognizer.
+                if recognizer.import_library():
+                    recognizer.speak(True)
+                continue  # Go back to the beginning of the loop
+
+            # Multi-line input mode
+            if start_marker in message:
+                lines = [message]
+                while True:
+                    line = input()
+                    lines.append(line)
+                    if end_marker in line:
+                        break
+                return "\n".join(lines)
+
+            # Single-line input mode
+            return message
+
+
+if __name__ == "__main__":
+    while True:  # manual smoke test: echo every message back until the process is interrupted
+        entered = cli_input("Enter text or '>' for speech input: ")
+        print(f"You entered: {entered}")