From 17dc4d75b56060d61d0e8dab36fed1a8695ff612 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sun, 30 Jun 2024 08:53:51 +0800 Subject: [PATCH] Fix for silero vad v5. The network input is 64 + 512 samples instead of 512 samples. --- sherpa-onnx/csrc/silero-vad-model.cc | 17 +++++++++-------- sherpa-onnx/csrc/silero-vad-model.h | 7 ++++--- sherpa-onnx/csrc/voice-activity-detector.cc | 6 +++++- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/sherpa-onnx/csrc/silero-vad-model.cc b/sherpa-onnx/csrc/silero-vad-model.cc index 2340d3671..66841d56d 100644 --- a/sherpa-onnx/csrc/silero-vad-model.cc +++ b/sherpa-onnx/csrc/silero-vad-model.cc @@ -74,9 +74,8 @@ class SileroVadModel::Impl { } bool IsSpeech(const float *samples, int32_t n) { - if (n != config_.silero_vad.window_size) { - SHERPA_ONNX_LOGE("n: %d != window_size: %d", n, - config_.silero_vad.window_size); + if (n != WindowSize()) { + SHERPA_ONNX_LOGE("n: %d != window_size: %d", n, WindowSize()); exit(-1); } @@ -146,9 +145,11 @@ class SileroVadModel::Impl { return false; } - int32_t WindowSize() const { return config_.silero_vad.window_size; } + int32_t WindowShift() const { return config_.silero_vad.window_size; } - int32_t WindowShift() const { return WindowSize() - window_shift_; } + int32_t WindowSize() const { + return config_.silero_vad.window_size + window_overlap_; + } int32_t MinSilenceDurationSamples() const { return min_silence_samples_; } @@ -177,9 +178,9 @@ class SileroVadModel::Impl { // 64 for 16kHz // 32 for 8kHz - window_shift_ = 64; + window_overlap_ = 64; - if (WindowSize() != 512) { + if (config_.silero_vad.window_size != 512) { SHERPA_ONNX_LOGE( "For silero_vad v5, we require window_size to be 512 for 16kHz"); exit(-1); @@ -423,7 +424,7 @@ class SileroVadModel::Impl { int32_t temp_start_ = 0; int32_t temp_end_ = 0; - int32_t window_shift_ = 0; + int32_t window_overlap_ = 0; bool is_v5_ = false; }; diff --git a/sherpa-onnx/csrc/silero-vad-model.h 
b/sherpa-onnx/csrc/silero-vad-model.h index a56775ce7..169cb7244 100644 --- a/sherpa-onnx/csrc/silero-vad-model.h +++ b/sherpa-onnx/csrc/silero-vad-model.h @@ -37,11 +37,12 @@ class SileroVadModel : public VadModel { */ bool IsSpeech(const float *samples, int32_t n) override; + // For silero vad V4, it is WindowShift(). + // For silero vad V5, it is WindowShift()+64 for 16kHz and + // WindowShift()+32 for 8kHz int32_t WindowSize() const override; - // For silero vad V4, it is WindowSize(). - // For silero vad V5, it is WindowSize()-64 for 16kHz and - // WindowSize()-32 for 8kHz + // 512 int32_t WindowShift() const override; int32_t MinSilenceDurationSamples() const override; diff --git a/sherpa-onnx/csrc/voice-activity-detector.cc b/sherpa-onnx/csrc/voice-activity-detector.cc index 199e54246..0f80f9cb5 100644 --- a/sherpa-onnx/csrc/voice-activity-detector.cc +++ b/sherpa-onnx/csrc/voice-activity-detector.cc @@ -44,13 +44,17 @@ class VoiceActivityDetector::Impl { // an extra buffer here last_.insert(last_.end(), samples, samples + n); + if (last_.size() < window_size) { + return; + } + // Note: For v4, window_shift == window_size int32_t k = (static_cast<int32_t>(last_.size()) - window_size) / window_shift + 1; const float *p = last_.data(); bool is_speech = false; - for (int32_t i = 0; i != k; ++i, p += window_shift) { + for (int32_t i = 0; i < k; ++i, p += window_shift) { buffer_.Push(p, window_shift); // NOTE(fangjun): Please don't use a very large n. bool this_window_is_speech = model_->IsSpeech(p, window_size);