Add a VAD Python example to remove silences from a file. (#963)

k2-fsa · Jun 3, 2024 · b31b9f3 · b31b9f3
1 parent 9edb78e
commit b31b9f3
Showing 1 changed file with 116 additions and 0 deletions.
diff --git a/python-api-examples/vad-remove-non-speech-segments-from-file.py b/python-api-examples/vad-remove-non-speech-segments-from-file.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+
+"""
+This file shows how to remove non-speech segments from a wave file,
+merge all the remaining speech segments into a single segment,
+and save the result to a new file.
+
+Usage
+
+python3 ./vad-remove-non-speech-segments-from-file.py \
+        --silero-vad-model silero_vad.onnx \
+        input.wav \
+        output.wav
+
+Please visit
+https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
+to download silero_vad.onnx
+
+For instance,
+
+wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
+"""
+
+import argparse
+from pathlib import Path
+from typing import Tuple
+
+import numpy as np
+import sherpa_onnx
+import soundfile as sf
+
+
+def assert_file_exists(filename: str):
+    assert Path(filename).is_file(), (
+        f"{filename} does not exist!\n"
+        "Please refer to "
+        "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
+    )
+
+
+def get_args():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--silero-vad-model",
+        type=str,
+        required=True,
+        help="Path to silero_vad.onnx",
+    )
+
+    parser.add_argument(
+        "input",
+        type=str,
+        help="Path to input.wav",
+    )
+
+    parser.add_argument(
+        "output",
+        type=str,
+        help="Path to output.wav",
+    )
+
+    return parser.parse_args()
+
+
+def load_audio(filename: str) -> Tuple[np.ndarray, int]:
+    data, sample_rate = sf.read(
+        filename,
+        always_2d=True,
+        dtype="float32",
+    )
+    data = data[:, 0]  # use only the first channel
+    samples = np.ascontiguousarray(data)
+    return samples, sample_rate
+
+
+def main():
+    args = get_args()
+    assert_file_exists(args.silero_vad_model)
+    assert_file_exists(args.input)
+
+    samples, sample_rate = load_audio(args.input)
+    if sample_rate != 16000:
+        import librosa
+
+        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000)
+        sample_rate = 16000
+
+    config = sherpa_onnx.VadModelConfig()
+    config.silero_vad.model = args.silero_vad_model
+    config.sample_rate = sample_rate
+
+    window_size = config.silero_vad.window_size
+
+    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)
+
+    speech_samples = []
+    while len(samples) > window_size:
+        vad.accept_waveform(samples[:window_size])
+        samples = samples[window_size:]
+
+        while not vad.empty():
+            speech_samples.extend(vad.front.samples)
+            vad.pop()
+
+    speech_samples = np.array(speech_samples, dtype=np.float32)
+
+    sf.write(args.output, speech_samples, samplerate=sample_rate)
+
+    print(f"Saved to {args.output}")
+
+
+# Run the command-line tool only when this file is executed as a script.
+if __name__ == "__main__":
+    main()