KoljaB · TommasoRella-github · Dec 15, 2024 · Dec 15, 2024 · Dec 15, 2024
diff --git a/RealtimeSTT/audio_recorder.py b/RealtimeSTT/audio_recorder.py
@@ -57,6 +57,8 @@
 import os
 import re
 import gc
+os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
+os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
 
 # Set OpenMP runtime duplicate library handling to OK (Use only for development!)
 os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
@@ -86,10 +88,10 @@
 if platform.system() != 'Darwin':
     INIT_HANDLE_BUFFER_OVERFLOW = True
 
-
 class TranscriptionWorker:
     def __init__(self, conn, stdout_pipe, model_path, download_root, compute_type, gpu_device_index, device,
-                 ready_event, shutdown_event, interrupt_stop_event, beam_size, initial_prompt, suppress_tokens, batch_size):
+                 ready_event, shutdown_event, interrupt_stop_event, beam_size, initial_prompt, suppress_tokens, batch_size,
+                 task,	word_timestamps, without_timestamps):
         self.conn = conn
         self.stdout_pipe = stdout_pipe
         self.model_path = model_path
@@ -104,7 +106,10 @@ def __init__(self, conn, stdout_pipe, model_path, download_root, compute_type, g
         self.initial_prompt = initial_prompt
         self.suppress_tokens = suppress_tokens
         self.batch_size = batch_size
-        self.queue = queue.Queue()
+        self.queue = queue.Queue()        
+        self.task = task
+        self.without_timestamps = without_timestamps		
+        self.word_timestamps = word_timestamps
 
     def custom_print(self, *args, **kwargs):
         message = ' '.join(map(str, args))
@@ -131,16 +136,18 @@ def run(self):
 
         logging.info(f"Initializing faster_whisper main transcription model {self.model_path}")
 
+
         try:
             model = faster_whisper.WhisperModel(
                 model_size_or_path=self.model_path,
-                device=self.device,
+                device=self.device,###'cuda' if torch.cuda.is_available() else 'cpu',
                 compute_type=self.compute_type,
                 device_index=self.gpu_device_index,
                 download_root=self.download_root,
             )
             if self.batch_size > 0:
                 model = BatchedInferencePipeline(model=model)
+
         except Exception as e:
             logging.exception(f"Error initializing main faster_whisper transcription model: {e}")
             raise
@@ -164,17 +171,29 @@ def run(self):
                                 beam_size=self.beam_size,
                                 initial_prompt=self.initial_prompt,
                                 suppress_tokens=self.suppress_tokens,
-                                batch_size=self.batch_size
+                                batch_size=self.batch_size,
+                                task=self.task,
+                                without_timestamps=self.without_timestamps,
+                                word_timestamps=self.word_timestamps,
                             )
                         else:
                             segments, info = model.transcribe(
                                 audio,
                                 language=language if language else None,
                                 beam_size=self.beam_size,
                                 initial_prompt=self.initial_prompt,
-                                suppress_tokens=self.suppress_tokens
+                                suppress_tokens=self.suppress_tokens,
+                                task=self.task,
+                                without_timestamps=self.without_timestamps,
+                                word_timestamps=self.word_timestamps,
                             )
 
+                        # # Print transcription and translation segments
+                        #print("Detected language '%s' with probability %f" % (info.language, info.language_probability))
+                        # for segment in segments:
+                        #     print("\n[%.2fs - %.2fs] %s" % (segment.start, segment.end, segment.text))
+                        # time.sleep(10)                            
+                        #segments = segments[0]
                         transcription = " ".join(seg.text for seg in segments).strip()
                         logging.debug(f"Final text detected with main model: {transcription}")
                         self.conn.send(('success', (transcription, info)))
@@ -259,7 +278,7 @@ def __init__(self,
                  on_vad_detect_stop=None,
 
                  # Wake word parameters
-                 wakeword_backend: str = "pvporcupine",
+                 wakeword_backend: str = "pvporcupine",   #"oww", "pvporcupine"
                  openwakeword_model_paths: str = None,
                  openwakeword_inference_framework: str = "onnx",
                  wake_words: str = "",
@@ -276,8 +295,8 @@ def __init__(self,
                  on_recorded_chunk=None,
                  debug_mode=False,
                  handle_buffer_overflow: bool = INIT_HANDLE_BUFFER_OVERFLOW,
-                 beam_size: int = 5,
-                 beam_size_realtime: int = 3,
+                 beam_size: int = 5,  # Number of search paths explored in parallel by beam search (https://en.wikipedia.org/wiki/Beam_search): 1 = fastest, least context precision; 5 = slower, more context precision.
+                 beam_size_realtime: int = 3,  # Beam search width used for the real-time transcription calls; same speed/precision trade-off as beam_size.
                  buffer_size: int = BUFFER_SIZE,
                  sample_rate: int = SAMPLE_RATE,
                  initial_prompt: Optional[Union[str, Iterable[int]]] = None,
@@ -287,7 +306,10 @@ def __init__(self,
                  allowed_latency_limit: int = ALLOWED_LATENCY_LIMIT,
                  no_log_file: bool = False,
                  use_extended_logging: bool = False,
+                 task: str = "transcribe",  # "transcribe" or "translate"
+                 without_timestamps: bool = False,
+                 word_timestamps: bool = True,
                  ):
         """
         Initializes an audio recorder and  transcription
         and wake word detection.
@@ -494,7 +517,9 @@ def __init__(self,
         - use_extended_logging (bool, default=False): Writes extensive
             log messages for the recording worker, that processes the audio
             chunks.
-
+        - task (str, default="transcribe"): "transcribe" or "translate".
+        - without_timestamps (bool, default=False): Passed to transcribe().
+        - word_timestamps (bool, default=True): Passed to transcribe().
         Raises:
             Exception: Errors related to initializing transcription
             model, wake word detection, or audio recording.
@@ -595,7 +620,9 @@ def __init__(self,
         self.print_transcription_time = print_transcription_time
         self.early_transcription_on_silence = early_transcription_on_silence
         self.use_extended_logging = use_extended_logging
-
+        self.task = task
+        self.without_timestamps = without_timestamps
+        self.word_timestamps = word_timestamps
         # Initialize the logging configuration with the specified level
         log_format = 'RealTimeSTT: %(name)s - %(levelname)s - %(message)s'
 
@@ -669,7 +696,10 @@ def __init__(self,
                 self.beam_size,
                 self.initial_prompt,
                 self.suppress_tokens,
-                self.batch_size
+                self.batch_size,
+                self.task,
+                self.word_timestamps,  # positional order must match TranscriptionWorker.__init__
+                self.without_timestamps,
             )
         )
 
@@ -824,7 +854,6 @@ def __init__(self,
                 verbose=False,
                 onnx=silero_use_onnx
             )
-
         except Exception as e:
             logging.exception(f"Error initializing Silero VAD "
                               f"voice activity detection engine: {e}"
@@ -2045,18 +2074,18 @@ def _realtime_worker(self):
           and a callback
         function is invoked with this text if specified.
         """
+        ### better inside the loop
+        # try:
 
-        try:
-
-            logging.debug('Starting realtime worker')
-
-            # Return immediately if real-time transcription is not enabled
-            if not self.enable_realtime_transcription:
-                return
+        logging.debug('Starting realtime worker')
 
-            # Continue running as long as the main process is active
-            while self.is_running:
+        # Return immediately if real-time transcription is not enabled
+        if not self.enable_realtime_transcription:
+            return
 
+        # Continue running as long as the main process is active
+        while self.is_running:
+            try:
                 # Check if the recording is active
                 if self.is_recording:
 
@@ -2106,15 +2135,21 @@ def _realtime_worker(self):
                                 beam_size=self.beam_size_realtime,
                                 initial_prompt=self.initial_prompt,
                                 suppress_tokens=self.suppress_tokens,
-                                batch_size=self.realtime_batch_size
+                                batch_size=self.realtime_batch_size,
+                                task=self.task,
+                                without_timestamps=self.without_timestamps,
+                                word_timestamps=self.word_timestamps,
                             )
                         else:
                             segments, info = self.realtime_model_type.transcribe(
                                 audio_array,
                                 language=self.language if self.language else None,
                                 beam_size=self.beam_size_realtime,
                                 initial_prompt=self.initial_prompt,
-                                suppress_tokens=self.suppress_tokens
+                                suppress_tokens=self.suppress_tokens,
+                                task=self.task,
+                                without_timestamps=self.without_timestamps,
+                                word_timestamps=self.word_timestamps,
                             )
 
                         self.detected_realtime_language = info.language if info.language_probability > 0 else None
@@ -2207,10 +2242,14 @@ def _realtime_worker(self):
                 # If not recording, sleep briefly before checking again
                 else:
                     time.sleep(TIME_SLEEP)
+            except Exception as e:  # keep the worker loop alive, but never lose the error
+                logging.exception(f"Unhandled exception in _realtime_worker: {e}")
 
-        except Exception as e:
-            logging.error(f"Unhandled exeption in _realtime_worker: {e}")
-            raise
+        # except Exception as e:
+        #     logging.error(f"Unhandled exeption in _realtime_worker: {e}")
+        #     raise
+
+
 
     def _is_silero_speech(self, chunk):
         """
@@ -2555,3 +2594,4 @@ def __exit__(self, exc_type, exc_value, traceback):
               exception, if any.
         """
         self.shutdown()
+