diff --git a/README.md b/README.md index 937f044..7a19b3a 100644 --- a/README.md +++ b/README.md @@ -507,6 +507,11 @@ When you initialize the `AudioToTextRecorder` class, you have various options to - **beam_size_realtime** (int, default=3): The beam size to use for real-time transcription beam search decoding. #### Voice Activation Parameters +- **silero_repo_or_dir** (str, default="snakers4/silero-vad"): Specifies the repository or directory from which to load the Silero VAD model. You can provide a GitHub repository name for remote loading or a local directory path for loading a custom model. + +- **silero_source** (str, default="github"): Specifies the source to use for loading the Silero VAD model. Set to "github" to load from a remote repository, or "local" to load from local files. + +- **silero_model** (str, default="silero_vad"): Specifies the model name to use for Silero VAD. Use "silero_vad" for the default model, or specify a custom model name if needed. - **silero_sensitivity** (float, default=0.6): Sensitivity for Silero's voice activity detection ranging from 0 (least sensitive) to 1 (most sensitive). Default is 0.6. 
diff --git a/RealtimeSTT/audio_recorder.py b/RealtimeSTT/audio_recorder.py index 59461b5..96eed1b 100644 --- a/RealtimeSTT/audio_recorder.py +++ b/RealtimeSTT/audio_recorder.py @@ -69,6 +69,9 @@ INIT_MODEL_TRANSCRIPTION_REALTIME = "tiny" INIT_REALTIME_PROCESSING_PAUSE = 0.2 INIT_REALTIME_INITIAL_PAUSE = 0.2 +INIT_SILERO_REPO_OR_DIR = "snakers4/silero-vad" +INIT_SILERO_SOURCE = "github" +INIT_SILERO_MODEL = "silero_vad" INIT_SILERO_SENSITIVITY = 0.4 INIT_WEBRTC_SENSITIVITY = 3 INIT_POST_SPEECH_SILENCE_DURATION = 0.6 @@ -279,6 +282,9 @@ def __init__(self, realtime_batch_size: int = 16, # Voice activation parameters + silero_repo_or_dir: str = INIT_SILERO_REPO_OR_DIR, + silero_source: str = INIT_SILERO_SOURCE, + silero_model: str = INIT_SILERO_MODEL, silero_sensitivity: float = INIT_SILERO_SENSITIVITY, silero_use_onnx: bool = False, silero_deactivity_detection: bool = False, @@ -418,6 +424,12 @@ def __init__(self, slight delay compared to the regular real-time updates. - realtime_batch_size (int, default=16): Batch size for the real-time transcription model. + - silero_repo_or_dir (str, default="snakers4/silero-vad"): Specifies the repository or directory + from which to load the Silero VAD model. It can be a GitHub repo name or a local directory. + - silero_source (str, default="github"): Specifies the source to use for loading the Silero VAD model. + Typically "github" for remote loading, or "local" for local files. + - silero_model (str, default="silero_vad"): Specifies the model name to use for Silero VAD. + Usually "silero_vad" unless using a custom model. - silero_sensitivity (float, default=SILERO_SENSITIVITY): Sensitivity for the Silero Voice Activity Detection model ranging from 0 (least sensitive) to 1 (most sensitive). Default is 0.5. 
@@ -920,8 +932,9 @@ def __init__(self,
         # Setup voice activity detection model Silero VAD
         try:
             self.silero_vad_model, _ = torch.hub.load(
-                repo_or_dir="snakers4/silero-vad",
-                model="silero_vad",
+                repo_or_dir=silero_repo_or_dir,
+                source=silero_source,
+                model=silero_model,
                 verbose=False,
                 onnx=silero_use_onnx
             )
diff --git a/tests/offiline_vad.py b/tests/offiline_vad.py
new file mode 100644
index 0000000..975f94e
--- /dev/null
+++ b/tests/offiline_vad.py
@@ -0,0 +1,22 @@
+"""Check that the Silero VAD model can be loaded from a local directory."""
+import os
+
+from RealtimeSTT import AudioToTextRecorder
+
+
+def test_offline_vad():
+    """Build a recorder with silero_source="local" and assert the VAD model is set."""
+    local_dir = os.path.abspath("./silero-vad")
+    source = "local"
+    recorder = AudioToTextRecorder(
+        silero_repo_or_dir=local_dir, silero_source=source, silero_deactivity_detection=True)
+    try:
+        assert recorder.silero_vad_model is not None, "Failed to load Silero VAD model offline"
+        print("Offline VAD test passed!")
+    finally:
+        # Always release the recorder, even when the assertion above fails.
+        recorder.shutdown()
+
+
+if __name__ == '__main__':
+    test_offline_vad()