Chat: support input from microphone
rmackay9 committed Dec 7, 2023
1 parent 7451c7f · commit bc06bff
Showing 2 changed files with 81 additions and 3 deletions.
MAVProxy/modules/mavproxy_chat/chat_voice_to_text.py (new file, 67 additions, 0 deletions)
@@ -0,0 +1,67 @@
'''
AI Chat Module voice-to-text class
Randy Mackay, December 2023
'''

import time
import pyaudio  # install with "sudo apt-get install python3-pyaudio"
import wave     # part of the Python standard library, no separate install required
from openai import OpenAI

class chat_voice_to_text():
    def __init__(self):
        # create connection object
        self.client = OpenAI()

    # record audio from microphone
    # returns filename of recording or None if failed
    def record_audio(self):
        # Initialize PyAudio
        p = pyaudio.PyAudio()

        # Open stream
        try:
            stream = p.open(format=pyaudio.paInt16, channels=1, rate=44100, input=True, frames_per_buffer=1024)
            print("recording audio: opened stream")
        except Exception:
            print("recording audio: failed to open stream")
            p.terminate()
            return None

        # calculate time recording should stop (5 seconds from now)
        curr_time = time.time()
        time_stop = curr_time + 5

        # record until specified time
        frames = []
        while curr_time < time_stop:
            data = stream.read(1024)
            frames.append(data)
            curr_time = time.time()
            print("recording audio: reading t:" + str(curr_time))
        print("recording audio: data collection complete")

        # Stop and close the stream (sample width is retrieved before PyAudio is terminated)
        stream.stop_stream()
        stream.close()
        sample_width = p.get_sample_size(pyaudio.paInt16)
        p.terminate()

        # Save audio file
        wf = wave.open("recording.wav", "wb")
        wf.setnchannels(1)
        wf.setsampwidth(sample_width)
        wf.setframerate(44100)
        wf.writeframes(b''.join(frames))
        wf.close()
        return "recording.wav"

    # convert audio to text
    # returns transcribed text on success or None if failed
    def convert_audio_to_text(self, audio_filename):
        # Process with Whisper, reusing the client created in __init__
        with open(audio_filename, "rb") as audio_file:
            transcript = self.client.audio.transcriptions.create(
                model="whisper-1",
                file=audio_file,
                response_format="text")
        # with response_format="text" the API returns the transcript as a plain string
        return transcript
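
For reference, a minimal sketch of exercising the new class outside of MAVProxy is shown below. It is not part of this commit and assumes the OPENAI_API_KEY environment variable is set and a working microphone is available; the five-second window and "recording.wav" filename are hard-coded in the class above.

# sketch only: drive the new voice-to-text class directly (assumes OPENAI_API_KEY is set)
from MAVProxy.modules.mavproxy_chat import chat_voice_to_text

vtt = chat_voice_to_text.chat_voice_to_text()
filename = vtt.record_audio()  # records roughly 5 seconds to "recording.wav"
if filename is None:
    print("recording failed")
else:
    print("transcript:", vtt.convert_audio_to_text(filename))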
MAVProxy/modules/mavproxy_chat/chat_window.py (14 additions, 3 deletions)
@@ -10,7 +10,7 @@
 from MAVProxy.modules.lib.mp_settings import MPSetting
 from MAVProxy.modules.lib.wx_loader import wx
 from MAVProxy.modules.lib import mp_util
-from MAVProxy.modules.mavproxy_chat import chat_openai
+from MAVProxy.modules.mavproxy_chat import chat_openai, chat_voice_to_text
 from threading import Thread
 from pymavlink import mavutil
 import time
@@ -60,6 +60,9 @@ def __init__(self, mpstate):
         # create chat_openai object
         self.chat_openai = chat_openai.chat_openai(self.mpstate)
 
+        # create chat_voice_to_text object
+        self.chat_voice_to_text = chat_voice_to_text.chat_voice_to_text()
+
         # run chat window in a separate thread
         self.thread = Thread(target=self.idle_task)
         self.thread.start()
@@ -94,8 +97,16 @@ def apikey_set_button_click(self, event):
 
     # record button clicked
     def record_button_click(self, event):
-        print("Record button not yet supported")
-        self.text_reply.SetValue("Record button not yet supported")
+        rec_filename = self.chat_voice_to_text.record_audio()
+        if rec_filename is None:
+            self.text_input.SetValue("Recording failed")
+        else:
+            self.text_input.SetValue("Recorded to " + rec_filename)
+            text = self.chat_voice_to_text.convert_audio_to_text(rec_filename)
+            if text is None:
+                self.text_input.SetValue("Audio to text conversion failed")
+            else:
+                self.text_input.SetValue(text)
 
     # send button clicked
     def send_button_click(self, event):
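The Record button that triggers record_button_click is created elsewhere in chat_window.py and is unchanged by this commit. For readers unfamiliar with wxPython, the event wiring typically looks roughly like the standalone sketch below; the class, label and variable names are assumptions for illustration, not code from the repository.

# hypothetical sketch of a wx button bound to a record handler; names are illustrative only
import wx

class DemoFrame(wx.Frame):
    def __init__(self):
        super().__init__(None, title="record demo")
        button = wx.Button(self, label="Rec")
        button.Bind(wx.EVT_BUTTON, self.record_button_click)

    def record_button_click(self, event):
        # chat_window.py calls chat_voice_to_text from its handler at this point
        print("record button clicked")

app = wx.App()
DemoFrame().Show()
app.MainLoop()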
