diff --git a/speech_recognition/__init__.py b/speech_recognition/__init__.py
index 253ab0fe..25530716 100644
--- a/speech_recognition/__init__.py
+++ b/speech_recognition/__init__.py
@@ -19,6 +19,7 @@
import tempfile
import threading
import time
+from typing import List, Optional
import uuid
import wave
from urllib.error import HTTPError, URLError
@@ -439,7 +440,77 @@ def snowboy_wait_for_hot_word(self, snowboy_location, snowboy_hot_word_files, so
return b"".join(frames), elapsed_time
- def listen(self, source, timeout=None, phrase_time_limit=None, snowboy_configuration=None, stream=False):
+ def porcupine_wait_for_hot_word(self, source,
+ access_key: str,
+ library_path: Optional[str] = None,
+ model_path: Optional[str] = None,
+ keyword_paths: Optional[List[str]] = None,
+ keywords: Optional[List[str]] = None,
+ sensitivities: Optional[List[float]] = None,
+ timeout=None):
+ import pvporcupine
+ import struct
+
+ porcupine = pvporcupine.create(access_key=access_key,
+ library_path=library_path,
+ model_path=model_path,
+ keyword_paths=keyword_paths,
+ keywords=keywords,
+ sensitivities=sensitivities
+ )
+ try:
+ porcupine_sample_rate = porcupine.sample_rate
+
+ elapsed_time = 0
+ chunk_size = min(source.CHUNK, porcupine.frame_length)
+ seconds_per_buffer = float(chunk_size) / source.SAMPLE_RATE
+ resampling_state = None
+
+ # buffers capable of holding 5 seconds of original audio
+ five_seconds_buffer_count = int(math.ceil(5 / seconds_per_buffer))
+ # buffers capable of holding 0.5 seconds of resampled audio
+ half_second_buffer_count = int(math.ceil(0.5 / seconds_per_buffer))
+
+ frames = collections.deque(maxlen=five_seconds_buffer_count)
+ resampled_frames = collections.deque(maxlen=half_second_buffer_count)
+
+ bytes_per_sample = 2
+
+ check_interval = 0.05
+ last_check = time.time()
+
+ found = False
+
+ while not found:
+ elapsed_time += seconds_per_buffer
+ if timeout and elapsed_time > timeout:
+ raise WaitTimeoutError("listening timed out while waiting for hotword to be said")
+
+ buffer = source.stream.read(chunk_size)
+ if len(buffer) == 0: break # reached end of the stream
+ frames.append(buffer)
+
+ # resample audio to the required sample rate
+ resampled_buffer, resampling_state = audioop.ratecv(buffer, source.SAMPLE_WIDTH, 1, source.SAMPLE_RATE, porcupine_sample_rate, resampling_state)
+ resampled_frames.append(resampled_buffer)
+
+ while (sum(len(x) for x in resampled_frames) >= porcupine.frame_length * bytes_per_sample):
+ buffer = b"".join(resampled_frames)
+ data = buffer[:porcupine.frame_length * bytes_per_sample]
+ buffer = buffer[porcupine.frame_length * bytes_per_sample:]
+ resampled_frames.clear()
+ resampled_frames.append(buffer)
+ pcm = struct.unpack("h" * porcupine.frame_length, data)
+ result = porcupine.process(pcm)
+ if result >= 0:
+ found = True
+ break
+
+ return b"".join(frames), elapsed_time
+ finally:
+ porcupine.delete()
+
+ def listen(self, source, timeout=None, phrase_time_limit=None, snowboy_configuration=None, porcupine_configuration=None, stream=False):
"""
Records a single phrase from ``source`` (an ``AudioSource`` instance) into an ``AudioData`` instance, which it returns.
@@ -453,15 +524,17 @@ def listen(self, source, timeout=None, phrase_time_limit=None, snowboy_configura
The ``snowboy_configuration`` parameter allows integration with `Snowboy `__, an offline, high-accuracy, power-efficient hotword recognition engine. When used, this function will pause until Snowboy detects a hotword, after which it will unpause. This parameter should either be ``None`` to turn off Snowboy support, or a tuple of the form ``(SNOWBOY_LOCATION, LIST_OF_HOT_WORD_FILES)``, where ``SNOWBOY_LOCATION`` is the path to the Snowboy root directory, and ``LIST_OF_HOT_WORD_FILES`` is a list of paths to Snowboy hotword configuration files (`*.pmdl` or `*.umdl` format).
+ The ``porcupine_configuration`` parameter allows integration with `Porcupine `__, an offline, high-accuracy, power-efficient hotword recognition engine. When used, this function will pause until Porcupine detects a hotword, after which it will unpause. This parameter should either be ``None`` to turn off Porcupine support, or a dictionary of the form ``{access_key: str, library_path: Optional[str], model_path: Optional[str], keyword_paths: Optional[List[str]], keywords: Optional[List[str]], sensitivities: Optional[List[float]]}``, see https://picovoice.ai/docs/api/porcupine-python/ for the use of these parameters.
+
This operation will always complete within ``timeout + phrase_timeout`` seconds if both are numbers, either by returning the audio data, or by raising a ``speech_recognition.WaitTimeoutError`` exception.
"""
- result = self._listen(source, timeout, phrase_time_limit, snowboy_configuration, stream)
+ result = self._listen(source, timeout, phrase_time_limit, snowboy_configuration, porcupine_configuration, stream)
if not stream:
for a in result:
return a
return result
- def _listen(self, source, timeout=None, phrase_time_limit=None, snowboy_configuration=None, stream=False):
+ def _listen(self, source, timeout=None, phrase_time_limit=None, snowboy_configuration=None, porcupine_configuration=None, stream=False):
assert isinstance(source, AudioSource), "Source must be an audio source"
assert source.stream is not None, "Audio source must be entered before listening, see documentation for ``AudioSource``; are you using ``source`` outside of a ``with`` statement?"
assert self.pause_threshold >= self.non_speaking_duration >= 0
@@ -481,7 +554,7 @@ def _listen(self, source, timeout=None, phrase_time_limit=None, snowboy_configur
while True:
frames = collections.deque()
- if snowboy_configuration is None:
+ if snowboy_configuration is None and porcupine_configuration is None:
# store audio input until the phrase starts
while True:
# handle waiting too long for phrase by raising an exception
@@ -504,13 +577,18 @@ def _listen(self, source, timeout=None, phrase_time_limit=None, snowboy_configur
damping = self.dynamic_energy_adjustment_damping ** seconds_per_buffer # account for different chunk sizes and rates
target_energy = energy * self.dynamic_energy_ratio
self.energy_threshold = self.energy_threshold * damping + target_energy * (1 - damping)
- else:
+ elif snowboy_configuration is not None:
# read audio input until the hotword is said
snowboy_location, snowboy_hot_word_files = snowboy_configuration
buffer, delta_time = self.snowboy_wait_for_hot_word(snowboy_location, snowboy_hot_word_files, source, timeout)
elapsed_time += delta_time
if len(buffer) == 0: break # reached end of the stream
frames.append(buffer)
+ else: # porcupine mode
+ buffer, delta_time = self.porcupine_wait_for_hot_word(source=source, **porcupine_configuration)
+ elapsed_time += delta_time
+ if len(buffer) == 0: break # reached end of the stream
+ frames.append(buffer)
# read audio input until the phrase ends
pause_count, phrase_count = 0, 0