Skip to content

Commit e7ffcbd

Browse files
authored
Add APIs about max speech duration in VAD for various programming languages (#1349)
1 parent 1423ddb commit e7ffcbd

File tree

31 files changed

+88
-9
lines changed

31 files changed

+88
-9
lines changed

.github/workflows/dot-net.yaml

+2
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,8 @@ jobs:
9393
git clone https://huggingface.co/csukuangfj/sherpa-onnx-libs huggingface
9494
9595
cd huggingface
96+
git fetch
97+
git pull
9698
mkdir -p windows-for-dotnet
9799
98100
cp -v ../sherpa-onnx-*.tar.bz2 ./windows-for-dotnet

dart-api-examples/vad-with-non-streaming-asr/bin/paraformer.dart

+1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ void main(List<String> arguments) async {
3232
model: sileroVad,
3333
minSilenceDuration: 0.25,
3434
minSpeechDuration: 0.5,
35+
maxSpeechDuration: 5.0,
3536
);
3637

3738
final vadConfig = sherpa_onnx.VadModelConfig(

dart-api-examples/vad-with-non-streaming-asr/bin/sense-voice-2.dart

+1
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ void main(List<String> arguments) async {
3838
model: sileroVad,
3939
minSilenceDuration: 0.25,
4040
minSpeechDuration: 0.5,
41+
maxSpeechDuration: 5.0,
4142
);
4243

4344
final vadConfig = sherpa_onnx.VadModelConfig(

dart-api-examples/vad-with-non-streaming-asr/bin/sense-voice.dart

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ void main(List<String> arguments) async {
3737
model: sileroVad,
3838
minSilenceDuration: 0.25,
3939
minSpeechDuration: 0.5,
40+
maxSpeechDuration: 5.0,
4041
);
4142

4243
final vadConfig = sherpa_onnx.VadModelConfig(

dart-api-examples/vad-with-non-streaming-asr/bin/telespeech-ctc.dart

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ void main(List<String> arguments) async {
3333
model: sileroVad,
3434
minSilenceDuration: 0.25,
3535
minSpeechDuration: 0.5,
36+
maxSpeechDuration: 5.0,
3637
);
3738

3839
final vadConfig = sherpa_onnx.VadModelConfig(

dart-api-examples/vad-with-non-streaming-asr/bin/whisper.dart

+1
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ void main(List<String> arguments) async {
3434
model: sileroVad,
3535
minSilenceDuration: 0.25,
3636
minSpeechDuration: 0.5,
37+
maxSpeechDuration: 5.0,
3738
);
3839

3940
final vadConfig = sherpa_onnx.VadModelConfig(

dart-api-examples/vad-with-non-streaming-asr/bin/zipformer-transducer.dart

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ void main(List<String> arguments) async {
3737
model: sileroVad,
3838
minSilenceDuration: 0.25,
3939
minSpeechDuration: 0.5,
40+
maxSpeechDuration: 5.0,
4041
);
4142

4243
final vadConfig = sherpa_onnx.VadModelConfig(

flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart

+3
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,9 @@ final class SherpaOnnxSileroVadModelConfig extends Struct {
301301

302302
@Int32()
303303
external int windowSize;
304+
305+
@Float()
306+
external double maxSpeechDuration;
304307
}
305308

306309
final class SherpaOnnxVadModelConfig extends Struct {

flutter/sherpa_onnx/lib/src/vad.dart

+5-2
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,20 @@ class SileroVadModelConfig {
1111
this.threshold = 0.5,
1212
this.minSilenceDuration = 0.5,
1313
this.minSpeechDuration = 0.25,
14-
this.windowSize = 512});
14+
this.windowSize = 512,
15+
this.maxSpeechDuration = 5.0});
1516

1617
@override
1718
String toString() {
18-
return 'SileroVadModelConfig(model: $model, threshold: $threshold, minSilenceDuration: $minSilenceDuration, minSpeechDuration: $minSpeechDuration, windowSize: $windowSize)';
19+
return 'SileroVadModelConfig(model: $model, threshold: $threshold, minSilenceDuration: $minSilenceDuration, minSpeechDuration: $minSpeechDuration, windowSize: $windowSize, maxSpeechDuration: $maxSpeechDuration)';
1920
}
2021

2122
final String model;
2223
final double threshold;
2324
final double minSilenceDuration;
2425
final double minSpeechDuration;
2526
final int windowSize;
27+
final double maxSpeechDuration;
2628
}
2729

2830
class VadModelConfig {
@@ -127,6 +129,7 @@ class VoiceActivityDetector {
127129
c.ref.sileroVad.minSilenceDuration = config.sileroVad.minSilenceDuration;
128130
c.ref.sileroVad.minSpeechDuration = config.sileroVad.minSpeechDuration;
129131
c.ref.sileroVad.windowSize = config.sileroVad.windowSize;
132+
c.ref.sileroVad.maxSpeechDuration = config.sileroVad.maxSpeechDuration;
130133

131134
c.ref.sampleRate = config.sampleRate;
132135
c.ref.numThreads = config.numThreads;

go-api-examples/vad-asr-paraformer/main.go

+1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ func main() {
2222
config.SileroVad.MinSilenceDuration = 0.5
2323
config.SileroVad.MinSpeechDuration = 0.25
2424
config.SileroVad.WindowSize = 512
25+
config.SileroVad.MaxSpeechDuration = 5.0
2526
config.SampleRate = 16000
2627
config.NumThreads = 1
2728
config.Provider = "cpu"

go-api-examples/vad-asr-whisper/main.go

+1
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ func main() {
2222
config.SileroVad.MinSilenceDuration = 0.5
2323
config.SileroVad.MinSpeechDuration = 0.25
2424
config.SileroVad.WindowSize = 512
25+
config.SileroVad.MaxSpeechDuration = 5.0
2526
config.SampleRate = 16000
2627
config.NumThreads = 1
2728
config.Provider = "cpu"

java-api-examples/VadNonStreamingParaformer.java

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ public static Vad createVad() {
1818
.setMinSilenceDuration(0.25f)
1919
.setMinSpeechDuration(0.5f)
2020
.setWindowSize(512)
21+
.setMaxSpeechDuration(5.0f)
2122
.build();
2223

2324
VadModelConfig config =

java-api-examples/VadNonStreamingSenseVoice.java

+1
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ public static Vad createVad() {
1818
.setMinSilenceDuration(0.25f)
1919
.setMinSpeechDuration(0.5f)
2020
.setWindowSize(512)
21+
.setMaxSpeechDuration(5.0f)
2122
.build();
2223

2324
VadModelConfig config =

java-api-examples/VadRemoveSilence.java

+1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ public static void main(String[] args) {
1919
.setMinSilenceDuration(0.25f)
2020
.setMinSpeechDuration(0.5f)
2121
.setWindowSize(512)
22+
.setMaxSpeechDuration(5.0f)
2223
.build();
2324

2425
VadModelConfig config =

lazarus-examples/generate_subtitles/my_init.pas

+2-1
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,9 @@ function CreateVad(VadFilename: AnsiString): TSherpaOnnxVoiceActivityDetector;
4848
WindowSize := 512; {Please don't change it unless you know the details}
4949

5050
Config.SileroVad.Model := VadFilename;
51-
Config.SileroVad.MinSpeechDuration := 0.5;
51+
Config.SileroVad.MinSpeechDuration := 0.25;
5252
Config.SileroVad.MinSilenceDuration := 0.5;
53+
Config.SileroVad.MaxSpeechDuration := 5.0;
5354
Config.SileroVad.Threshold := 0.5;
5455
Config.SileroVad.WindowSize := WindowSize;
5556
Config.NumThreads:= 2;

nodejs-addon-examples/test_vad_with_non_streaming_asr_whisper.js

+1
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ function createVad() {
3434
threshold: 0.5,
3535
minSpeechDuration: 0.25,
3636
minSilenceDuration: 0.5,
37+
maxSpeechDuration: 5,
3738
windowSize: 512,
3839
},
3940
sampleRate: 16000,

nodejs-examples/test-vad-with-non-streaming-asr-whisper.js

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ function createVad() {
2929
threshold: 0.5,
3030
minSpeechDuration: 0.25,
3131
minSilenceDuration: 0.5,
32+
maxSpeechDuration: 5,
3233
windowSize: 512,
3334
},
3435
sampleRate: 16000,

python-api-examples/vad-remove-non-speech-segments-from-file.py

+9
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,15 @@ def main():
9090

9191
config = sherpa_onnx.VadModelConfig()
9292
config.silero_vad.model = args.silero_vad_model
93+
config.silero_vad.threshold = 0.5
94+
config.silero_vad.min_silence_duration = 0.25 # seconds
95+
config.silero_vad.min_speech_duration = 0.25 # seconds
96+
97+
# If the current speech segment is longer than this value, the VAD increases
98+
# the threshold to 0.9 internally. After detecting this segment,
99+
# it resets the threshold to its original value.
100+
config.silero_vad.max_speech_duration = 5 # seconds
101+
93102
config.sample_rate = sample_rate
94103

95104
window_size = config.silero_vad.window_size

scripts/dotnet/SileroVadModelConfig.cs

+3
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ public SileroVadModelConfig()
1414
MinSilenceDuration = 0.5F;
1515
MinSpeechDuration = 0.25F;
1616
WindowSize = 512;
17+
MaxSpeechDuration = 5.0F;
1718
}
1819

1920
[MarshalAs(UnmanagedType.LPStr)]
@@ -26,5 +27,7 @@ public SileroVadModelConfig()
2627
public float MinSpeechDuration;
2728

2829
public int WindowSize;
30+
31+
public float MaxSpeechDuration;
2932
}
3033
}

scripts/go/sherpa_onnx.go

+2
Original file line numberDiff line numberDiff line change
@@ -771,6 +771,7 @@ type SileroVadModelConfig struct {
771771
MinSilenceDuration float32
772772
MinSpeechDuration float32
773773
WindowSize int
774+
MaxSpeechDuration float32
774775
}
775776

776777
type VadModelConfig struct {
@@ -849,6 +850,7 @@ func NewVoiceActivityDetector(config *VadModelConfig, bufferSizeInSeconds float3
849850
c.silero_vad.min_silence_duration = C.float(config.SileroVad.MinSilenceDuration)
850851
c.silero_vad.min_speech_duration = C.float(config.SileroVad.MinSpeechDuration)
851852
c.silero_vad.window_size = C.int(config.SileroVad.WindowSize)
853+
c.silero_vad.max_speech_duration = C.float(config.SileroVad.MaxSpeechDuration)
852854

853855
c.sample_rate = C.int(config.SampleRate)
854856
c.num_threads = C.int(config.NumThreads)

scripts/node-addon-api/lib/vad.js

+3
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@ config = {
3939
sileroVad: {
4040
model: "./silero_vad.onnx",
4141
threshold: 0.5,
42+
minSilenceDuration: 0.5,
43+
minSpeechDuration: 0.25,
44+
maxSpeechDuration: 5,
4245
}
4346
}
4447
*/

scripts/node-addon-api/src/vad.cc

+1
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,7 @@ static SherpaOnnxSileroVadModelConfig GetSileroVadConfig(
279279
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_silence_duration, minSilenceDuration);
280280
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_speech_duration, minSpeechDuration);
281281
SHERPA_ONNX_ASSIGN_ATTR_INT32(window_size, windowSize);
282+
SHERPA_ONNX_ASSIGN_ATTR_FLOAT(max_speech_duration, maxSpeechDuration);
282283

283284
return c;
284285
}

sherpa-onnx/c-api/c-api.cc

+3
Original file line numberDiff line numberDiff line change
@@ -907,6 +907,9 @@ SherpaOnnxVoiceActivityDetector *SherpaOnnxCreateVoiceActivityDetector(
907907
vad_config.silero_vad.window_size =
908908
SHERPA_ONNX_OR(config->silero_vad.window_size, 512);
909909

910+
vad_config.silero_vad.max_speech_duration =
911+
SHERPA_ONNX_OR(config->silero_vad.max_speech_duration, 20);
912+
910913
vad_config.sample_rate = SHERPA_ONNX_OR(config->sample_rate, 16000);
911914
vad_config.num_threads = SHERPA_ONNX_OR(config->num_threads, 1);
912915
vad_config.provider = SHERPA_ONNX_OR(config->provider, "cpu");

sherpa-onnx/c-api/c-api.h

+5
Original file line numberDiff line numberDiff line change
@@ -746,6 +746,11 @@ SHERPA_ONNX_API typedef struct SherpaOnnxSileroVadModelConfig {
746746
float min_speech_duration;
747747

748748
int window_size;
749+
750+
// If a speech segment is longer than this value (in seconds), we increase
751+
// the threshold to 0.9. After the segment has been detected,
752+
// the threshold value is reset to its original value.
753+
float max_speech_duration;
749754
} SherpaOnnxSileroVadModelConfig;
750755

751756
SHERPA_ONNX_API typedef struct SherpaOnnxVadModelConfig {

sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/SileroVadModelConfig.java

+12
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,15 @@ public class SileroVadModelConfig {
88
private final float minSilenceDuration;
99
private final float minSpeechDuration;
1010
private final int windowSize;
11+
private final float maxSpeechDuration;
1112

1213
private SileroVadModelConfig(Builder builder) {
1314
this.model = builder.model;
1415
this.threshold = builder.threshold;
1516
this.minSilenceDuration = builder.minSilenceDuration;
1617
this.minSpeechDuration = builder.minSpeechDuration;
1718
this.windowSize = builder.windowSize;
19+
this.maxSpeechDuration = builder.maxSpeechDuration;
1820
}
1921

2022
public static Builder builder() {
@@ -41,12 +43,17 @@ public int getWindowSize() {
4143
return windowSize;
4244
}
4345

46+
public float getMaxSpeechDuration() {
47+
return maxSpeechDuration;
48+
}
49+
4450
public static class Builder {
4551
private String model = "";
4652
private float threshold = 0.5f;
4753
private float minSilenceDuration = 0.25f;
4854
private float minSpeechDuration = 0.5f;
4955
private int windowSize = 512;
56+
private float maxSpeechDuration = 5.0f;
5057

5158
public SileroVadModelConfig build() {
5259
return new SileroVadModelConfig(this);
@@ -77,5 +84,10 @@ public Builder setWindowSize(int windowSize) {
7784
this.windowSize = windowSize;
7885
return this;
7986
}
87+
88+
public Builder setMaxSpeechDuration(float maxSpeechDuration) {
89+
this.maxSpeechDuration = maxSpeechDuration;
90+
return this;
91+
}
8092
}
8193
}

sherpa-onnx/jni/voice-activity-detector.cc

+4
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ static VadModelConfig GetVadModelConfig(JNIEnv *env, jobject config) {
4040
fid = env->GetFieldID(silero_vad_config_cls, "windowSize", "I");
4141
ans.silero_vad.window_size = env->GetIntField(silero_vad_config, fid);
4242

43+
fid = env->GetFieldID(silero_vad_config_cls, "maxSpeechDuration", "F");
44+
ans.silero_vad.max_speech_duration =
45+
env->GetFloatField(silero_vad_config, fid);
46+
4347
fid = env->GetFieldID(cls, "sampleRate", "I");
4448
ans.sample_rate = env->GetIntField(config, fid);
4549

sherpa-onnx/kotlin-api/Vad.kt

+1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ data class SileroVadModelConfig(
99
var minSilenceDuration: Float = 0.25F,
1010
var minSpeechDuration: Float = 0.25F,
1111
var windowSize: Int = 512,
12+
var maxSpeechDuration: Float = 5.0F,
1213
)
1314

1415
data class VadModelConfig(

sherpa-onnx/pascal-api/sherpa_onnx.pas

+7-2
Original file line numberDiff line numberDiff line change
@@ -341,6 +341,7 @@ TSherpaOnnxSileroVadModelConfig = record
341341
MinSilenceDuration: Single;
342342
MinSpeechDuration: Single;
343343
WindowSize: Integer;
344+
MaxSpeechDuration: Single;
344345
function ToString: AnsiString;
345346
class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
346347
end;
@@ -594,6 +595,7 @@ SherpaOnnxSileroVadModelConfig = record
594595
MinSilenceDuration: cfloat;
595596
MinSpeechDuration: cfloat;
596597
WindowSize: cint32;
598+
MaxSpeechDuration: cfloat;
597599
end;
598600
SherpaOnnxVadModelConfig = record
599601
SileroVad: SherpaOnnxSileroVadModelConfig;
@@ -1402,10 +1404,11 @@ function TSherpaOnnxSileroVadModelConfig.ToString: AnsiString;
14021404
'Threshold := %.2f, ' +
14031405
'MinSilenceDuration := %.2f, ' +
14041406
'MinSpeechDuration := %.2f, ' +
1405-
'WindowSize := %d' +
1407+
'WindowSize := %d, ' +
1408+
'MaxSpeechDuration := %.2f' +
14061409
')',
14071410
[Self.Model, Self.Threshold, Self.MinSilenceDuration,
1408-
Self.MinSpeechDuration, Self.WindowSize
1411+
Self.MinSpeechDuration, Self.WindowSize, Self.MaxSpeechDuration
14091412
]);
14101413
end;
14111414

@@ -1415,6 +1418,7 @@ function TSherpaOnnxSileroVadModelConfig.ToString: AnsiString;
14151418
Dest.MinSilenceDuration := 0.5;
14161419
Dest.MinSpeechDuration := 0.25;
14171420
Dest.WindowSize := 512;
1421+
Dest.MaxSpeechDuration := 5.0;
14181422
end;
14191423

14201424
function TSherpaOnnxVadModelConfig.ToString: AnsiString;
@@ -1569,6 +1573,7 @@ constructor TSherpaOnnxVoiceActivityDetector.Create(Config: TSherpaOnnxVadModelC
15691573
C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration;
15701574
C.SileroVad.MinSpeechDuration := Config.SileroVad.MinSpeechDuration;
15711575
C.SileroVad.WindowSize := Config.SileroVad.WindowSize;
1576+
C.SileroVad.MaxSpeechDuration := Config.SileroVad.MaxSpeechDuration;
15721577

15731578
C.SampleRate := Config.SampleRate;
15741579
C.NumThreads := Config.NumThreads;

swift-api-examples/SherpaOnnx.swift

+4-2
Original file line numberDiff line numberDiff line change
@@ -550,14 +550,16 @@ func sherpaOnnxSileroVadModelConfig(
550550
threshold: Float = 0.5,
551551
minSilenceDuration: Float = 0.25,
552552
minSpeechDuration: Float = 0.5,
553-
windowSize: Int = 512
553+
windowSize: Int = 512,
554+
maxSpeechDuration: Float = 5.0
554555
) -> SherpaOnnxSileroVadModelConfig {
555556
return SherpaOnnxSileroVadModelConfig(
556557
model: toCPointer(model),
557558
threshold: threshold,
558559
min_silence_duration: minSilenceDuration,
559560
min_speech_duration: minSpeechDuration,
560-
window_size: Int32(windowSize)
561+
window_size: Int32(windowSize),
562+
max_speech_duration: maxSpeechDuration
561563
)
562564
}
563565

0 commit comments

Comments
 (0)