# whisper-dictation.py (forked from foges/whisper-dictation)
import argparse
import platform
import threading
import time

import numpy as np
import pyaudio
import rumps
from pynput import keyboard
from whispercpp import Whisper


class SpeechTranscriber:
    def __init__(self, whisper: Whisper):
        self.whisper = whisper
        self.pykeyboard = keyboard.Controller()

    def transcribe(self, audio_data, language=None):
        # `language` is accepted for interface symmetry but is not forwarded:
        # whispercpp's transcribe() is called with the audio buffer only.
        result = self.whisper.transcribe(audio_data)
        try:
            self.pykeyboard.type(result)
            time.sleep(0.0025)
        except Exception:
            # Typing can fail if the frontmost app refuses synthetic keystrokes;
            # in that case the transcription is silently dropped.
            pass
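
# A minimal standalone sketch of the class above (illustrative only; assumes
# `samples` is a 16 kHz mono float32 NumPy array in [-1.0, 1.0), the format
# Recorder below produces):
#
#     w = Whisper.from_pretrained("tiny.en")
#     SpeechTranscriber(w).transcribe(samples)
#
# The transcribed text is typed into the frontmost application via pynput.
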
class Recorder:
    def __init__(self, transcriber):
        self.recording = False
        self.transcriber = transcriber

    def start(self, language=None):
        thread = threading.Thread(target=self._record_impl, args=(language,))
        thread.start()

    def stop(self):
        self.recording = False

    def _record_impl(self, language):
        self.recording = True
        frames_per_buffer = 1024
        p = pyaudio.PyAudio()
        stream = p.open(format=pyaudio.paInt16,
                        channels=1,
                        rate=16000,
                        frames_per_buffer=frames_per_buffer,
                        input=True)
        frames = []

        while self.recording:
            data = stream.read(frames_per_buffer)
            frames.append(data)

        stream.stop_stream()
        stream.close()
        p.terminate()

        audio_data = np.frombuffer(b''.join(frames), dtype=np.int16)
        audio_data_fp32 = audio_data.astype(np.float32) / 32768.0
        self.transcriber.transcribe(audio_data_fp32, language)
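
# Why the int16 -> float32 conversion above: PyAudio delivers signed 16-bit PCM
# (range -32768..32767), while whisper.cpp expects float samples normalized to
# roughly [-1.0, 1.0), hence the division by 32768.0. A quick illustrative check:
#
#     pcm = np.array([-32768, 0, 16384], dtype=np.int16)
#     print(pcm.astype(np.float32) / 32768.0)   # [-1.0, 0.0, 0.5]
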
class GlobalKeyListener:
    def __init__(self, app, key_combination):
        self.app = app
        self.key1, self.key2 = self.parse_key_combination(key_combination)
        self.key1_pressed = False
        self.key2_pressed = False

    def parse_key_combination(self, key_combination):
        key1_name, key2_name = key_combination.split('+')
        key1 = getattr(keyboard.Key, key1_name)
        key2 = getattr(keyboard.Key, key2_name)
        return key1, key2

    def on_key_press(self, key):
        if key == self.key1:
            self.key1_pressed = True
        elif key == self.key2:
            self.key2_pressed = True

        if self.key1_pressed and self.key2_pressed:
            self.app.toggle()

    def on_key_release(self, key):
        if key == self.key1:
            self.key1_pressed = False
        elif key == self.key2:
            self.key2_pressed = False
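
# GlobalKeyListener only tracks key state; it must be wired to a pynput
# keyboard.Listener to receive events (done in __main__ below). A sketch:
#
#     key_listener = GlobalKeyListener(app, 'cmd_l+alt')
#     keyboard.Listener(on_press=key_listener.on_key_press,
#                       on_release=key_listener.on_key_release).start()
#
# Note: parse_key_combination resolves names via keyboard.Key, so only special
# key names (cmd_l, alt, ctrl, ...) are valid here, not character keys like 'a'.
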
class StatusBarApp(rumps.App):
    def __init__(self, recorder, languages=None, max_time=None):
        super().__init__("whisper", "⏯")
        self.languages = languages
        self.current_language = languages[0] if languages is not None else None

        menu = [
            'Start Recording',
            'Stop Recording',
            None,
        ]

        if languages is not None:
            for lang in languages:
                callback = self.change_language if lang != self.current_language else None
                menu.append(rumps.MenuItem(lang, callback=callback))
            menu.append(None)

        self.menu = menu
        self.menu['Stop Recording'].set_callback(None)

        self.started = False
        self.recorder = recorder
        self.max_time = max_time
        self.timer = None
        self.elapsed_time = 0

    def change_language(self, sender):
        self.current_language = sender.title
        for lang in self.languages:
            self.menu[lang].set_callback(self.change_language if lang != self.current_language else None)

    @rumps.clicked('Start Recording')
    def start_app(self, _):
        print('Listening...')
        self.started = True
        self.menu['Start Recording'].set_callback(None)
        self.menu['Stop Recording'].set_callback(self.stop_app)
        self.recorder.start(self.current_language)

        if self.max_time is not None:
            self.timer = threading.Timer(self.max_time, lambda: self.stop_app(None))
            self.timer.start()

        self.start_time = time.time()
        self.update_title()

    @rumps.clicked('Stop Recording')
    def stop_app(self, _):
        if not self.started:
            return

        if self.timer is not None:
            self.timer.cancel()

        print('Transcribing...')
        self.title = "⏯"
        self.started = False
        self.menu['Stop Recording'].set_callback(None)
        self.menu['Start Recording'].set_callback(self.start_app)
        self.recorder.stop()
        print('Done.\n')

    def update_title(self):
        if self.started:
            self.elapsed_time = int(time.time() - self.start_time)
            minutes, seconds = divmod(self.elapsed_time, 60)
            self.title = f"({minutes:02d}:{seconds:02d}) 🔴"
            threading.Timer(1, self.update_title).start()

    def toggle(self):
        if self.started:
            self.stop_app(None)
        else:
            self.start_app(None)
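
# rumps menu items are enabled and disabled by toggling their callbacks:
# set_callback(None) leaves an item with no action (shown disabled), while
# set_callback(fn) re-enables it. That is how Start Recording and Stop
# Recording are kept mutually exclusive above.
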
def parse_args():
    parser = argparse.ArgumentParser(
        description='Dictation app using the OpenAI Whisper ASR model. By default the keyboard shortcut '
                    'cmd+option (macOS) or ctrl+alt (other platforms) starts and stops dictation.')
    parser.add_argument('-m', '--model_name', type=str,
                        choices=['tiny', 'tiny.en', 'base', 'base.en', 'small', 'small.en', 'medium', 'medium.en', 'large'],
                        default='base',
                        help='Specify the Whisper ASR model to use: tiny, base, small, medium, or large, each except '
                             'large also available as an English-only ".en" variant. For an up-to-date list of models '
                             'with their size, memory footprint, and estimated transcription speed, see '
                             'https://github.com/openai/whisper#available-models-and-languages. Models ending in ".en" '
                             'are trained only on English speech and perform better on English. The small, medium, and '
                             'large models can be slow to transcribe and are only recommended if you find the base '
                             'model insufficient. Default: base.')
    parser.add_argument('-k', '--key_combination', type=str,
                        default='cmd_l+alt' if platform.system() == 'Darwin' else 'ctrl+alt',
                        help='Specify the key combination that toggles the app, e.g. cmd_l+alt on macOS or ctrl+alt '
                             'on other platforms. Default: cmd_l+alt (macOS) or ctrl+alt (others).')
    parser.add_argument('-l', '--language', type=str, default=None,
                        help='Specify a two-letter language code (e.g. "en" for English) to improve recognition '
                             'accuracy; multiple codes may be given separated by commas. This can be especially '
                             'helpful for the smaller model sizes. For the full list of supported languages, see '
                             'https://github.com/openai/whisper/blob/main/whisper/tokenizer.py.')
    parser.add_argument('-t', '--max_time', type=float, default=30,
                        help='Specify the maximum recording time in seconds. The app automatically stops recording '
                             'after this duration. Default: 30 seconds.')

    args = parser.parse_args()

    if args.language is not None:
        args.language = args.language.split(',')

    if args.model_name.endswith('.en') and args.language is not None and any(lang != 'en' for lang in args.language):
        raise ValueError('If using a model ending in .en, you cannot specify a language other than English.')

    return args
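
# Example invocations (using the flags defined above):
#
#     python whisper-dictation.py                      # base model, 30 s max
#     python whisper-dictation.py -m tiny.en -t 60     # English-only, 60 s max
#     python whisper-dictation.py -l en,fr             # menu offers en and fr
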
if __name__ == "__main__":
    args = parse_args()

    # Load the Whisper model selected via -m/--model_name.
    w = Whisper.from_pretrained(args.model_name)
    transcriber = SpeechTranscriber(w)
    recorder = Recorder(transcriber)

    app = StatusBarApp(recorder, args.language, args.max_time)
    key_listener = GlobalKeyListener(app, args.key_combination)
    listener = keyboard.Listener(on_press=key_listener.on_key_press,
                                 on_release=key_listener.on_key_release)
    listener.start()

    print("Running... ")
    app.run()
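
# Note for macOS users: pynput needs Accessibility (and possibly Input
# Monitoring) permission to observe global hotkeys and to type text, and
# PyAudio needs Microphone permission. If the hotkey or dictation appears to
# do nothing, grant these under System Settings > Privacy & Security.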