Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix for silero vad v5. #1065

Merged
merged 1 commit into from
Jun 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 9 additions & 8 deletions sherpa-onnx/csrc/silero-vad-model.cc
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,8 @@ class SileroVadModel::Impl {
}

bool IsSpeech(const float *samples, int32_t n) {
if (n != config_.silero_vad.window_size) {
SHERPA_ONNX_LOGE("n: %d != window_size: %d", n,
config_.silero_vad.window_size);
if (n != WindowSize()) {
SHERPA_ONNX_LOGE("n: %d != window_size: %d", n, WindowSize());
exit(-1);
}

Expand Down Expand Up @@ -146,9 +145,11 @@ class SileroVadModel::Impl {
return false;
}

int32_t WindowSize() const { return config_.silero_vad.window_size; }
int32_t WindowShift() const { return config_.silero_vad.window_size; }

int32_t WindowShift() const { return WindowSize() - window_shift_; }
int32_t WindowSize() const {
return config_.silero_vad.window_size + window_overlap_;
}

int32_t MinSilenceDurationSamples() const { return min_silence_samples_; }

Expand Down Expand Up @@ -177,9 +178,9 @@ class SileroVadModel::Impl {

// 64 for 16kHz
// 32 for 8kHz
window_shift_ = 64;
window_overlap_ = 64;

if (WindowSize() != 512) {
if (config_.silero_vad.window_size != 512) {
SHERPA_ONNX_LOGE(
"For silero_vad v5, we require window_size to be 512 for 16kHz");
exit(-1);
Expand Down Expand Up @@ -423,7 +424,7 @@ class SileroVadModel::Impl {
int32_t temp_start_ = 0;
int32_t temp_end_ = 0;

int32_t window_shift_ = 0;
int32_t window_overlap_ = 0;

bool is_v5_ = false;
};
Expand Down
7 changes: 4 additions & 3 deletions sherpa-onnx/csrc/silero-vad-model.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,12 @@ class SileroVadModel : public VadModel {
*/
bool IsSpeech(const float *samples, int32_t n) override;

// For silero vad V4, it is WindowShift().
// For silero vad V5, it is WindowShift()+64 for 16kHz and
// WindowShift()+32 for 8kHz
int32_t WindowSize() const override;

// For silero vad V4, it is WindowSize().
// For silero vad V5, it is WindowSize()-64 for 16kHz and
// WindowSize()-32 for 8kHz
// Equal to the configured window_size; for silero vad V5 at 16kHz this must be 512.
int32_t WindowShift() const override;

int32_t MinSilenceDurationSamples() const override;
Expand Down
6 changes: 5 additions & 1 deletion sherpa-onnx/csrc/voice-activity-detector.cc
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,17 @@ class VoiceActivityDetector::Impl {
// an extra buffer here
last_.insert(last_.end(), samples, samples + n);

if (last_.size() < window_size) {
return;
}

// Note: For v4, window_shift == window_size
int32_t k =
(static_cast<int32_t>(last_.size()) - window_size) / window_shift + 1;
const float *p = last_.data();
bool is_speech = false;

for (int32_t i = 0; i != k; ++i, p += window_shift) {
for (int32_t i = 0; i < k; ++i, p += window_shift) {
buffer_.Push(p, window_shift);
// NOTE(fangjun): Please don't use a very large n.
bool this_window_is_speech = model_->IsSpeech(p, window_size);
Expand Down
Loading