Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 67 additions & 27 deletions RealtimeSTT/audio_recorder.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@
import os
import re
import gc
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

# Set OpenMP runtime duplicate library handling to OK (Use only for development!)
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
Expand Down Expand Up @@ -86,10 +88,10 @@
if platform.system() != 'Darwin':
INIT_HANDLE_BUFFER_OVERFLOW = True


class TranscriptionWorker:
def __init__(self, conn, stdout_pipe, model_path, download_root, compute_type, gpu_device_index, device,
ready_event, shutdown_event, interrupt_stop_event, beam_size, initial_prompt, suppress_tokens, batch_size):
ready_event, shutdown_event, interrupt_stop_event, beam_size, initial_prompt, suppress_tokens, batch_size,
task, word_timestamps, without_timestamps):
self.conn = conn
self.stdout_pipe = stdout_pipe
self.model_path = model_path
Expand All @@ -104,7 +106,10 @@ def __init__(self, conn, stdout_pipe, model_path, download_root, compute_type, g
self.initial_prompt = initial_prompt
self.suppress_tokens = suppress_tokens
self.batch_size = batch_size
self.queue = queue.Queue()
self.queue = queue.Queue()
self.task = task
self.without_timestamps = without_timestamps
self.word_timestamps = word_timestamps

def custom_print(self, *args, **kwargs):
message = ' '.join(map(str, args))
Expand All @@ -131,16 +136,18 @@ def run(self):

logging.info(f"Initializing faster_whisper main transcription model {self.model_path}")


try:
model = faster_whisper.WhisperModel(
model_size_or_path=self.model_path,
device=self.device,
device=self.device,###'cuda' if torch.cuda.is_available() else 'cpu',
compute_type=self.compute_type,
device_index=self.gpu_device_index,
download_root=self.download_root,
)
if self.batch_size > 0:
model = BatchedInferencePipeline(model=model)

except Exception as e:
logging.exception(f"Error initializing main faster_whisper transcription model: {e}")
raise
Expand All @@ -164,17 +171,29 @@ def run(self):
beam_size=self.beam_size,
initial_prompt=self.initial_prompt,
suppress_tokens=self.suppress_tokens,
batch_size=self.batch_size
batch_size=self.batch_size,
task=self.task,
without_timestamps=self.without_timestamps,
word_timestamps=self.word_timestamps
)
else:
segments, info = model.transcribe(
audio,
language=language if language else None,
beam_size=self.beam_size,
initial_prompt=self.initial_prompt,
suppress_tokens=self.suppress_tokens
suppress_tokens=self.suppress_tokens,
task=self.task,
without_timestamps=self.without_timestamps,
word_timestamps=self.word_timestamps
)

# # Print transcription and translation segments
#print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
# for segment in segments:
# print("\n[%.2fs - %.2fs] %s" % (segment.start, segment.end, segment.text))
# time.sleep(10)
#segments = segments[0]
transcription = " ".join(seg.text for seg in segments).strip()
logging.debug(f"Final text detected with main model: {transcription}")
self.conn.send(('success', (transcription, info)))
Expand Down Expand Up @@ -259,7 +278,7 @@ def __init__(self,
on_vad_detect_stop=None,

# Wake word parameters
wakeword_backend: str = "pvporcupine",
wakeword_backend: str = "pvporcupine", #"oww", "pvporcupine"
openwakeword_model_paths: str = None,
openwakeword_inference_framework: str = "onnx",
wake_words: str = "",
Expand All @@ -276,8 +295,8 @@ def __init__(self,
on_recorded_chunk=None,
debug_mode=False,
handle_buffer_overflow: bool = INIT_HANDLE_BUFFER_OVERFLOW,
beam_size: int = 5,
beam_size_realtime: int = 3,
beam_size: int = 5, #<---how many searching path to use.(https://en.wikipedia.org/wiki/Beam_search) (1=faster,lessContextPrecision ...5=lower,more context precision) (1-deep search in 1 branch, 5 -deep search in 5 branchs simultan.)
beam_size_realtime: int = 3, #<---how many searching path to use.(https://en.wikipedia.org/wiki/Beam_search) (1=faster,lessContextPrecision ...5=lower,more context precision) (1-deep search in 1 branch, 5 -deep search in 5 branchs simultan.)
buffer_size: int = BUFFER_SIZE,
sample_rate: int = SAMPLE_RATE,
initial_prompt: Optional[Union[str, Iterable[int]]] = None,
Expand All @@ -287,7 +306,11 @@ def __init__(self,
allowed_latency_limit: int = ALLOWED_LATENCY_LIMIT,
no_log_file: bool = False,
use_extended_logging: bool = False,
task: str = "transcribe",
without_timestamps: int = False,
word_timestamps: int = True,
):
print( "AudioToTextRecorder___INIT___batched_model" )
"""
Initializes an audio recorder and transcription
and wake word detection.
Expand Down Expand Up @@ -494,7 +517,9 @@ def __init__(self,
- use_extended_logging (bool, default=False): Writes extensive
log messages for the recording worker, that processes the audio
chunks.

- task "translate" or "transcribe"
- without_timestamps: int = False,
- word_timestamps: int = True,
Raises:
Exception: Errors related to initializing transcription
model, wake word detection, or audio recording.
Expand Down Expand Up @@ -595,7 +620,9 @@ def __init__(self,
self.print_transcription_time = print_transcription_time
self.early_transcription_on_silence = early_transcription_on_silence
self.use_extended_logging = use_extended_logging

self.task = task
self.without_timestamps = without_timestamps
self.word_timestamps = word_timestamps
# Initialize the logging configuration with the specified level
log_format = 'RealTimeSTT: %(name)s - %(levelname)s - %(message)s'

Expand Down Expand Up @@ -669,7 +696,10 @@ def __init__(self,
self.beam_size,
self.initial_prompt,
self.suppress_tokens,
self.batch_size
self.batch_size,
self.task,
self.without_timestamps,
self.word_timestamps ,
)
)

Expand Down Expand Up @@ -824,7 +854,6 @@ def __init__(self,
verbose=False,
onnx=silero_use_onnx
)

except Exception as e:
logging.exception(f"Error initializing Silero VAD "
f"voice activity detection engine: {e}"
Expand Down Expand Up @@ -2045,18 +2074,18 @@ def _realtime_worker(self):
and a callback
function is invoked with this text if specified.
"""
### better inside the loop
# try:

try:

logging.debug('Starting realtime worker')

# Return immediately if real-time transcription is not enabled
if not self.enable_realtime_transcription:
return
logging.debug('Starting realtime worker')

# Continue running as long as the main process is active
while self.is_running:
# Return immediately if real-time transcription is not enabled
if not self.enable_realtime_transcription:
return

# Continue running as long as the main process is active
while self.is_running:
try:
# Check if the recording is active
if self.is_recording:

Expand Down Expand Up @@ -2106,15 +2135,21 @@ def _realtime_worker(self):
beam_size=self.beam_size_realtime,
initial_prompt=self.initial_prompt,
suppress_tokens=self.suppress_tokens,
batch_size=self.realtime_batch_size
batch_size=self.realtime_batch_size,
task=self.task,
without_timestamps=self.without_timestamps,
word_timestamps=self.word_timestamps
)
else:
segments, info = self.realtime_model_type.transcribe(
audio_array,
language=self.language if self.language else None,
beam_size=self.beam_size_realtime,
initial_prompt=self.initial_prompt,
suppress_tokens=self.suppress_tokens
suppress_tokens=self.suppress_tokens,
task=self.task,
without_timestamps=self.without_timestamps,
word_timestamps=self.word_timestamps
)

self.detected_realtime_language = info.language if info.language_probability > 0 else None
Expand Down Expand Up @@ -2207,10 +2242,14 @@ def _realtime_worker(self):
# If not recording, sleep briefly before checking again
else:
time.sleep(TIME_SLEEP)
except Exception as e:
logging.error("")

except Exception as e:
logging.error(f"Unhandled exeption in _realtime_worker: {e}")
raise
# except Exception as e:
# logging.error(f"Unhandled exeption in _realtime_worker: {e}")
# raise



def _is_silero_speech(self, chunk):
"""
Expand Down Expand Up @@ -2555,3 +2594,4 @@ def __exit__(self, exc_type, exc_value, traceback):
exception, if any.
"""
self.shutdown()