diff --git a/python-api-examples/vad-remove-non-speech-segments-from-file.py b/python-api-examples/vad-remove-non-speech-segments-from-file.py
new file mode 100755
index 000000000..34d0f955b
--- /dev/null
+++ b/python-api-examples/vad-remove-non-speech-segments-from-file.py
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+
+"""
+This file shows how to remove non-speech segments,
+merge all speech segments into a single large segment,
+and save the result to a file.
+
+Usage:
+
+python3 ./vad-remove-non-speech-segments-from-file.py \
+  --silero-vad-model silero_vad.onnx \
+  input.wav \
+  output.wav
+
+Please visit
+https://github.com/snakers4/silero-vad/blob/master/files/silero_vad.onnx
+to download silero_vad.onnx
+
+For instance,
+
+wget https://github.com/snakers4/silero-vad/raw/master/files/silero_vad.onnx
+"""
+
+import argparse
+from pathlib import Path
+from typing import Tuple
+
+import numpy as np
+import sherpa_onnx
+import soundfile as sf
+
+
+def assert_file_exists(filename: str):
+    assert Path(filename).is_file(), (
+        f"{filename} does not exist!\n"
+        "Please refer to "
+        "https://k2-fsa.github.io/sherpa/onnx/pretrained_models/index.html to download it"
+    )
+
+
+def get_args():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+
+    parser.add_argument(
+        "--silero-vad-model",
+        type=str,
+        required=True,
+        help="Path to silero_vad.onnx",
+    )
+
+    parser.add_argument(
+        "input",
+        type=str,
+        help="Path to input.wav",
+    )
+
+    parser.add_argument(
+        "output",
+        type=str,
+        help="Path to output.wav",
+    )
+
+    return parser.parse_args()
+
+
+def load_audio(filename: str) -> Tuple[np.ndarray, int]:
+    data, sample_rate = sf.read(
+        filename,
+        always_2d=True,
+        dtype="float32",
+    )
+    data = data[:, 0]  # use only the first channel
+    samples = np.ascontiguousarray(data)
+    return samples, sample_rate
+
+
+def main():
+    args = get_args()
+    assert_file_exists(args.silero_vad_model)
+    assert_file_exists(args.input)
+
+    samples, sample_rate = load_audio(args.input)
+    if sample_rate != 16000:
+        import librosa
+
+        samples = librosa.resample(samples, orig_sr=sample_rate, target_sr=16000)
+        sample_rate = 16000
+
+    config = sherpa_onnx.VadModelConfig()
+    config.silero_vad.model = args.silero_vad_model
+    config.sample_rate = sample_rate
+
+    window_size = config.silero_vad.window_size
+
+    vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=30)
+
+    speech_samples = []
+    while len(samples) > window_size:
+        vad.accept_waveform(samples[:window_size])
+        samples = samples[window_size:]
+
+        while not vad.empty():
+            speech_samples.extend(vad.front.samples)
+            vad.pop()
+
+    speech_samples = np.array(speech_samples, dtype=np.float32)
+
+    sf.write(args.output, speech_samples, samplerate=sample_rate)
+
+    print(f"Saved to {args.output}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc
index f6160deb1..eaf782b92 100644
--- a/sherpa-onnx/c-api/c-api.cc
+++ b/sherpa-onnx/c-api/c-api.cc
@@ -79,6 +79,10 @@ SherpaOnnxOnlineRecognizer *CreateOnlineRecognizer(
       SHERPA_ONNX_OR(config->model_config.model_type, "");
   recognizer_config.model_config.debug =
       SHERPA_ONNX_OR(config->model_config.debug, 0);
+  recognizer_config.model_config.modeling_unit =
+      SHERPA_ONNX_OR(config->model_config.modeling_unit, "cjkchar");
+  recognizer_config.model_config.bpe_vocab =
+      SHERPA_ONNX_OR(config->model_config.bpe_vocab, "");
 
   recognizer_config.decoding_method =
       SHERPA_ONNX_OR(config->decoding_method, "greedy_search");
@@ -357,6 +361,10 @@ SherpaOnnxOfflineRecognizer *CreateOfflineRecognizer(
       SHERPA_ONNX_OR(config->model_config.provider, "cpu");
   recognizer_config.model_config.model_type =
       SHERPA_ONNX_OR(config->model_config.model_type, "");
+  recognizer_config.model_config.modeling_unit =
+      SHERPA_ONNX_OR(config->model_config.modeling_unit, "cjkchar");
+  recognizer_config.model_config.bpe_vocab =
+      SHERPA_ONNX_OR(config->model_config.bpe_vocab, "");
 
   recognizer_config.lm_config.model =
       SHERPA_ONNX_OR(config->lm_config.model, "");
diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h
index 1b1b56330..bd9b6a4d4 100644
--- a/sherpa-onnx/c-api/c-api.h
+++ b/sherpa-onnx/c-api/c-api.h
@@ -82,6 +82,12 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOnlineModelConfig {
   const char *provider;
   int32_t debug;  // true to print debug information of the model
   const char *model_type;
+  // Valid values:
+  //  - cjkchar
+  //  - bpe
+  //  - cjkchar+bpe
+  const char *modeling_unit;
+  const char *bpe_vocab;
 } SherpaOnnxOnlineModelConfig;
 
 /// It expects 16 kHz 16-bit single channel wave format.
@@ -383,6 +389,12 @@ SHERPA_ONNX_API typedef struct SherpaOnnxOfflineModelConfig {
   int32_t debug;
   const char *provider;
   const char *model_type;
+  // Valid values:
+  //  - cjkchar
+  //  - bpe
+  //  - cjkchar+bpe
+  const char *modeling_unit;
+  const char *bpe_vocab;
 } SherpaOnnxOfflineModelConfig;
 
 SHERPA_ONNX_API typedef struct SherpaOnnxOfflineRecognizerConfig {
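Below is a minimal C sketch (not part of the patch above) of how a caller might set the two new fields when building an offline recognizer. Only modeling_unit and bpe_vocab come from this change; the transducer, tokens, and num_threads fields are assumed from the existing SherpaOnnxOfflineModelConfig, and all file paths are placeholders.

/* Minimal usage sketch for the new modeling_unit / bpe_vocab fields.
 * Assumption: transducer/tokens/num_threads follow the existing C API;
 * the model paths below are placeholders, not real files. */
#include <string.h>

#include "sherpa-onnx/c-api/c-api.h"

int main() {
  SherpaOnnxOfflineRecognizerConfig config;
  // Zero-initialize so unset fields pick up the defaults applied via
  // SHERPA_ONNX_OR in c-api.cc (e.g. modeling_unit -> "cjkchar").
  memset(&config, 0, sizeof(config));

  // Placeholder model files; replace with a real pre-trained model.
  config.model_config.transducer.encoder = "./encoder.onnx";
  config.model_config.transducer.decoder = "./decoder.onnx";
  config.model_config.transducer.joiner = "./joiner.onnx";
  config.model_config.tokens = "./tokens.txt";
  config.model_config.num_threads = 1;

  // Fields added by this patch.
  // Valid modeling_unit values: cjkchar, bpe, cjkchar+bpe.
  config.model_config.modeling_unit = "cjkchar+bpe";
  config.model_config.bpe_vocab = "./bpe.vocab";

  SherpaOnnxOfflineRecognizer *recognizer = CreateOfflineRecognizer(&config);

  // ... create streams, decode, fetch results ...

  DestroyOfflineRecognizer(recognizer);
  return 0;
}

Because the new fields fall back to "cjkchar" and "" through SHERPA_ONNX_OR, existing C-API callers that zero-initialize their config keep the previous behavior without any changes.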