Skip to content

Commit

Permalink
Fix for silero vad v5. (#1065)
Browse files Browse the repository at this point in the history
For silero VAD v5 at 16 kHz, the network input is 64 + 512 = 576 samples per window instead of 512 samples.
  • Loading branch information
csukuangfj authored Jun 30, 2024
1 parent 61c7eb3 commit 6cb0181
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 12 deletions.
17 changes: 9 additions & 8 deletions sherpa-onnx/csrc/silero-vad-model.cc
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,8 @@ class SileroVadModel::Impl {
}

bool IsSpeech(const float *samples, int32_t n) {
if (n != config_.silero_vad.window_size) {
SHERPA_ONNX_LOGE("n: %d != window_size: %d", n,
config_.silero_vad.window_size);
if (n != WindowSize()) {
SHERPA_ONNX_LOGE("n: %d != window_size: %d", n, WindowSize());
exit(-1);
}

Expand Down Expand Up @@ -146,9 +145,11 @@ class SileroVadModel::Impl {
return false;
}

int32_t WindowSize() const { return config_.silero_vad.window_size; }
int32_t WindowShift() const { return config_.silero_vad.window_size; }

int32_t WindowShift() const { return WindowSize() - window_shift_; }
int32_t WindowSize() const {
return config_.silero_vad.window_size + window_overlap_;
}

int32_t MinSilenceDurationSamples() const { return min_silence_samples_; }

Expand Down Expand Up @@ -177,9 +178,9 @@ class SileroVadModel::Impl {

// 64 for 16kHz
// 32 for 8kHz
window_shift_ = 64;
window_overlap_ = 64;

if (WindowSize() != 512) {
if (config_.silero_vad.window_size != 512) {
SHERPA_ONNX_LOGE(
"For silero_vad v5, we require window_size to be 512 for 16kHz");
exit(-1);
Expand Down Expand Up @@ -423,7 +424,7 @@ class SileroVadModel::Impl {
int32_t temp_start_ = 0;
int32_t temp_end_ = 0;

int32_t window_shift_ = 0;
int32_t window_overlap_ = 0;

bool is_v5_ = false;
};
Expand Down
7 changes: 4 additions & 3 deletions sherpa-onnx/csrc/silero-vad-model.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,12 @@ class SileroVadModel : public VadModel {
*/
bool IsSpeech(const float *samples, int32_t n) override;

// Number of samples that must be passed to IsSpeech() in each call.
// For silero vad V4, it is equal to WindowShift().
// For silero vad V5, it is WindowShift()+64 for 16kHz and
// WindowShift()+32 for 8kHz.
int32_t WindowSize() const override;

// For silero vad V4, it is WindowSize().
// For silero vad V5, it is WindowSize()-64 for 16kHz and
// WindowSize()-32 for 8kHz
// It is the configured window_size, e.g., 512 for silero vad V5 at 16kHz.
int32_t WindowShift() const override;

int32_t MinSilenceDurationSamples() const override;
Expand Down
6 changes: 5 additions & 1 deletion sherpa-onnx/csrc/voice-activity-detector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,17 @@ class VoiceActivityDetector::Impl {
// an extra buffer here
last_.insert(last_.end(), samples, samples + n);

if (last_.size() < window_size) {
return;
}

// Note: For v4, window_shift == window_size.
// For v5, window_size > window_shift, so consecutive windows overlap.
int32_t k =
(static_cast<int32_t>(last_.size()) - window_size) / window_shift + 1;
const float *p = last_.data();
bool is_speech = false;

for (int32_t i = 0; i != k; ++i, p += window_shift) {
for (int32_t i = 0; i < k; ++i, p += window_shift) {
buffer_.Push(p, window_shift);
// NOTE(fangjun): Please don't use a very large n.
bool this_window_is_speech = model_->IsSpeech(p, window_size);
Expand Down

0 comments on commit 6cb0181

Please sign in to comment.