Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Speech recognition #1386

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,7 @@ In the interactive mode, you can use the below commands to enhance your experien
- `%undo`: Removes the previous user message and the AI's response from the message history.
- `%tokens [prompt]`: (_Experimental_) Calculate the tokens that will be sent with the next prompt as context and estimate their cost. Optionally calculate the tokens and estimated cost of a `prompt` if one is provided. Relies on [LiteLLM's `cost_per_token()` method](https://docs.litellm.ai/docs/completion/token_usage#2-cost_per_token) for estimated costs.
- `%help`: Show the help message.
- `>`: Start speech recognition mode using Google's speech recognition API. Saying `exit` returns to text mode. Requires an internet connection and enabling the [speech library](https://nerdvittles.com/creating-an-api-key-for-google-speech-recognition/).

### Configuration / Profiles

Expand Down
21 changes: 17 additions & 4 deletions interpreter/terminal_interface/terminal_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,20 +212,33 @@ def terminal_interface(interpreter, message):
response = input(
"Would you like to run this code? (y/n)\n\n"
)
response = response.strip().lower()
else:
response = input(
" Would you like to run this code? (y/n)\n\n "
# edit isn't in original prompt, but there's a branch for it just below here. Wonder what it's for.
response = (
cli_input(
" Would you like to run this code? (yes/no) "
)
.strip()
.lower()
)
if response == "yes":
response = "y"
if response == "edit":
response = "e"
if response == "no":
response = "n"

print("") # <- Aesthetic choice

if response.strip().lower() == "y":
if response == "y":
# Create a new, identical block where the code will actually be run
# Conveniently, the chunk includes everything we need to do this:
active_block = CodeBlock(interpreter)
active_block.margin_top = False # <- Aesthetic choice
active_block.language = language
active_block.code = code
elif response.strip().lower() == "e":
elif response == "e":
# Edit

# Create a temporary file
Expand Down
133 changes: 119 additions & 14 deletions interpreter/terminal_interface/utils/cli_input.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,122 @@
import sys
import time

"""Return input from keyboard or speech recognition."""


class SpeechRecognizer:
    """Handle speech recognition using Google's speech recognition API.

    The third-party ``speech_recognition`` package is imported lazily by
    :meth:`import_library`, so constructing this class does not require any
    audio dependency to be installed.

    You must enable the API in the Google Cloud console:

    1. Join the chromium-dev group when logged into Gmail:
       https://groups.google.com/a/chromium.org/g/chromium-dev
    2. Create a project. Go to APIs and services -> Library. Search for
       "speech" and enable the Speech API.
    3. Go to API manager -> Credentials and create an API key.
    """

    def __init__(self):
        # True while input should come from the microphone instead of stdin.
        self.speech_mode = False
        # True once import_library() has created the recognizer and microphone.
        self.imported = False

    def speak(self, val=None) -> bool:
        """Get or set speech mode.

        Called with no argument, return the current value. Called with an
        argument, store it as the new value and return it.
        """
        if val is None:
            return self.speech_mode
        self.speech_mode = val
        return self.speech_mode

    def import_library(self) -> bool:
        """Load the speech libraries on first use and return the loaded status.

        On success ``self.sr`` (the module), ``self.r`` (a ``Recognizer``) and
        ``self.mic`` (a ``Microphone``) are set and later calls return ``True``
        immediately. On failure installation instructions are printed and
        ``False`` is returned.
        """
        if self.imported:
            return True
        try:
            import speech_recognition as sr

            self.sr = sr
            self.r = sr.Recognizer()
            # Microphone() raises AttributeError (not ImportError) when
            # PyAudio is missing, so both are treated as "not installed".
            self.mic = sr.Microphone()
        except (ImportError, AttributeError):
            print(
                "Please install the SpeechRecognition and pyaudio libraries by executing the following commands:"
            )
            if sys.platform == "darwin":
                print("brew install portaudio")
            if sys.platform == "linux":
                print("sudo apt install python3-pyaudio")
                print("If that doesn't work, you may need to install portaudio19 from source:")
                print("https://www.portaudio.com/ then ./configure && make && make install.")
            print("pip install SpeechRecognition pyaudio")
            return False
        # Only mark as imported once every attribute above exists.
        self.imported = True
        return True

    def listen(self) -> str:
        """Listen for speech and return the transcribed text.

        Returns an empty string when the audio could not be understood or the
        recognition request failed. :meth:`import_library` must have succeeded
        before this is called, because it reads ``self.mic``, ``self.r`` and
        ``self.sr``.
        """
        with self.mic as source:
            print("Listening...", end='', flush=True)
            # This might be good. More testing needed. Might work better without it.
            self.r.adjust_for_ambient_noise(source)
            audio = self.r.listen(source)

        try:
            text = self.r.recognize_google(audio)
            # Trailing spaces overwrite whatever is left of "Listening...".
            print(f"\rYou said: {text}" + " " * 30)
            return text
        except self.sr.UnknownValueError:
            print("\rCould not understand audio." + " " * 30 + "\r", end='', flush=True)
            # Leave the notice visible briefly before wiping it.
            time.sleep(2)
            print("\r" + " " * 30 + "\r", end='', flush=True)  # Clear the line
            return ""
        except self.sr.RequestError as e:
            print(
                f"\rCould not request results from Google Speech Recognition service; {e}"
            )
            return ""


# Module-level recognizer used by cli_input(); keeping a single instance lets
# the speech-mode flag persist from one prompt to the next.
recognizer = SpeechRecognizer()


def cli_input(prompt: str = "") -> str:
    """Return user input from keyboard or speech.

    In text mode the prompt is read with ``input()``:

    * Entering ``>`` alone tries to switch to speech mode. If the speech
      libraries cannot be loaded, installation instructions are printed and
      the prompt is shown again in text mode.
    * A message containing ``\"\"\"`` starts multi-line input; further lines
      are read until one contains ``\"\"\"`` and all lines are returned
      joined by newlines.
    * Anything else is returned as is.

    In speech mode the module-level ``recognizer`` is asked to listen. Saying
    "exit" (in any letter case) returns to text mode, unrecognised audio is
    retried, and any other transcription is returned.
    """
    start_marker = '"""'
    end_marker = '"""'

    while True:
        if recognizer.speak():
            print(prompt, end='', flush=True)
            text = recognizer.listen()
            # Transcriptions may come back capitalised, so compare loosely.
            if text.strip().lower() == "exit":
                print("\rExiting speech recognition mode.")
                recognizer.speak(False)
            elif text:
                return text
            else:
                print("\r" + " " * 30 + "\r", end='', flush=True)  # Clear the line
            continue

        message = input(prompt)

        # Speech recognition trigger
        if message == ">":
            # Enable speech mode only when the libraries actually loaded;
            # otherwise there is no microphone to listen with and the next
            # iteration would fail instead of prompting again.
            if recognizer.import_library():
                recognizer.speak(True)
            continue  # Go back to the beginning of the loop

        # Multi-line input mode
        if start_marker in message:
            lines = [message]
            while True:
                line = input()
                lines.append(line)
                if end_marker in line:
                    break
            return "\n".join(lines)

        # Single-line input mode
        return message


if __name__ == "__main__":
    # Manual demo: keep prompting and echo back whatever cli_input() returns.
    while True:
        user_input = cli_input(prompt="Enter text or '>' for speech input: ")
        print("You entered: " + f"{user_input}")