diff --git a/.github/workflows/pascal.yaml b/.github/workflows/pascal.yaml index 98abc4c01..e780c39f7 100644 --- a/.github/workflows/pascal.yaml +++ b/.github/workflows/pascal.yaml @@ -116,12 +116,54 @@ jobs: cp -v install/lib/*.dll ../pascal-api-examples/read-wav cp -v install/lib/*.dll ../pascal-api-examples/streaming-asr cp -v install/lib/*.dll ../pascal-api-examples/non-streaming-asr + cp -v install/lib/*.dll ../pascal-api-examples/vad + cp -v install/lib/*.dll ../pascal-api-examples/vad-with-non-streaming-asr cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/read-wav cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/streaming-asr cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/non-streaming-asr + cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad + cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad-with-non-streaming-asr fi + - name: Run Pascal test (VAD + non-streaming ASR) + shell: bash + run: | + export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH + + cd ./pascal-api-examples + + pushd vad-with-non-streaming-asr + time ./run-vad-with-whisper.sh + rm -rf sherpa-onnx-* + echo "---" + + time ./run-vad-with-sense-voice.sh + rm -rf sherpa-onnx-* + echo "---" + + ls -lh + + popd + + - name: Run Pascal test (VAD test) + shell: bash + run: | + export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH + + cd ./pascal-api-examples + + pushd vad + ./run-circular-buffer.sh + echo "---" + + time ./run-remove-silence.sh + echo "---" + + ls -lh + + popd + - name: Run Pascal test (Read wav test) shell: bash run: | diff --git a/pascal-api-examples/README.md b/pascal-api-examples/README.md index 125437788..76d5dfd37 100644 --- a/pascal-api-examples/README.md +++ b/pascal-api-examples/README.md @@ -8,3 +8,5 @@ APIs of [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx). 
|[read-wav](./read-wav)|It shows how to read a wave file.| |[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.| |[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.| +|[vad](./vad)| It shows how to use the voice activity detection API.| +|[vad-with-non-streaming-asr](./vad-with-non-streaming-asr)| It shows how to use the voice activity detection API with non-streaming models for speech recognition.| diff --git a/pascal-api-examples/non-streaming-asr/nemo_ctc.pas b/pascal-api-examples/non-streaming-asr/nemo_ctc.pas index 0b4622c35..8acaa5f08 100644 --- a/pascal-api-examples/non-streaming-asr/nemo_ctc.pas +++ b/pascal-api-examples/non-streaming-asr/nemo_ctc.pas @@ -33,6 +33,8 @@ Duration: Single; RealTimeFactor: Single; begin + Initialize(Config); + Config.ModelConfig.NeMoCtC.Model := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/model.onnx'; Config.ModelConfig.Tokens := './sherpa-onnx-nemo-fast-conformer-ctc-be-de-en-es-fr-hr-it-pl-ru-uk-20k/tokens.txt'; Config.ModelConfig.Provider := 'cpu'; diff --git a/pascal-api-examples/non-streaming-asr/nemo_transducer.pas b/pascal-api-examples/non-streaming-asr/nemo_transducer.pas index cbd8c1fdf..1ed3ef8da 100644 --- a/pascal-api-examples/non-streaming-asr/nemo_transducer.pas +++ b/pascal-api-examples/non-streaming-asr/nemo_transducer.pas @@ -33,6 +33,8 @@ Duration: Single; RealTimeFactor: Single; begin + Initialize(Config); + Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/encoder.onnx'; Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/decoder.onnx'; Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-nemo-fast-conformer-transducer-be-de-en-es-fr-hr-it-pl-ru-uk-20k/joiner.onnx'; diff --git a/pascal-api-examples/non-streaming-asr/paraformer.pas 
b/pascal-api-examples/non-streaming-asr/paraformer.pas index 3ad76dc27..57d9956ec 100644 --- a/pascal-api-examples/non-streaming-asr/paraformer.pas +++ b/pascal-api-examples/non-streaming-asr/paraformer.pas @@ -33,6 +33,8 @@ Duration: Single; RealTimeFactor: Single; begin + Initialize(Config); + Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx'; Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt'; Config.ModelConfig.Provider := 'cpu'; diff --git a/pascal-api-examples/non-streaming-asr/paraformer_itn.pas b/pascal-api-examples/non-streaming-asr/paraformer_itn.pas index 172af597b..cde3be4cc 100644 --- a/pascal-api-examples/non-streaming-asr/paraformer_itn.pas +++ b/pascal-api-examples/non-streaming-asr/paraformer_itn.pas @@ -33,6 +33,8 @@ Duration: Single; RealTimeFactor: Single; begin + Initialize(Config); + Config.ModelConfig.Paraformer.Model := './sherpa-onnx-paraformer-zh-2023-09-14/model.int8.onnx'; Config.ModelConfig.Tokens := './sherpa-onnx-paraformer-zh-2023-09-14/tokens.txt'; Config.ModelConfig.Provider := 'cpu'; diff --git a/pascal-api-examples/non-streaming-asr/sense_voice.pas b/pascal-api-examples/non-streaming-asr/sense_voice.pas index 5963ba1e8..29a6c196a 100644 --- a/pascal-api-examples/non-streaming-asr/sense_voice.pas +++ b/pascal-api-examples/non-streaming-asr/sense_voice.pas @@ -33,6 +33,8 @@ Duration: Single; RealTimeFactor: Single; begin + Initialize(Config); + Config.ModelConfig.SenseVoice.Model := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx'; Config.ModelConfig.SenseVoice.Language := 'auto'; Config.ModelConfig.SenseVoice.UseItn := False; diff --git a/pascal-api-examples/non-streaming-asr/telespeech_ctc.pas b/pascal-api-examples/non-streaming-asr/telespeech_ctc.pas index 8424775f0..3f53dd775 100644 --- a/pascal-api-examples/non-streaming-asr/telespeech_ctc.pas +++ b/pascal-api-examples/non-streaming-asr/telespeech_ctc.pas @@ -33,6 +33,8 @@ 
Duration: Single; RealTimeFactor: Single; begin + Initialize(Config); + Config.ModelConfig.TeleSpeechCtc := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/model.int8.onnx'; Config.ModelConfig.Tokens := './sherpa-onnx-telespeech-ctc-int8-zh-2024-06-04/tokens.txt'; Config.ModelConfig.Provider := 'cpu'; diff --git a/pascal-api-examples/non-streaming-asr/whisper.pas b/pascal-api-examples/non-streaming-asr/whisper.pas index f32c8e232..0e5222eb0 100644 --- a/pascal-api-examples/non-streaming-asr/whisper.pas +++ b/pascal-api-examples/non-streaming-asr/whisper.pas @@ -33,6 +33,8 @@ Duration: Single; RealTimeFactor: Single; begin + Initialize(Config); + Config.ModelConfig.Whisper.Encoder := './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx'; Config.ModelConfig.Whisper.Decoder := './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx'; Config.ModelConfig.Tokens := './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt'; diff --git a/pascal-api-examples/non-streaming-asr/zipformer_transducer.pas b/pascal-api-examples/non-streaming-asr/zipformer_transducer.pas index 343a5c57e..a9fb22461 100644 --- a/pascal-api-examples/non-streaming-asr/zipformer_transducer.pas +++ b/pascal-api-examples/non-streaming-asr/zipformer_transducer.pas @@ -33,6 +33,8 @@ Duration: Single; RealTimeFactor: Single; begin + Initialize(Config); + Config.ModelConfig.Transducer.Encoder := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/encoder-epoch-30-avg-1.int8.onnx'; Config.ModelConfig.Transducer.Decoder := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/decoder-epoch-30-avg-1.onnx'; Config.ModelConfig.Transducer.Joiner := './sherpa-onnx-zipformer-gigaspeech-2023-12-12/joiner-epoch-30-avg-1.onnx'; diff --git a/pascal-api-examples/vad-with-non-streaming-asr/.gitignore b/pascal-api-examples/vad-with-non-streaming-asr/.gitignore new file mode 100644 index 000000000..4718ed421 --- /dev/null +++ b/pascal-api-examples/vad-with-non-streaming-asr/.gitignore @@ -0,0 +1,3 @@ +!run-*.sh +vad_with_whisper 
+vad_with_sense_voice diff --git a/pascal-api-examples/vad-with-non-streaming-asr/README.md b/pascal-api-examples/vad-with-non-streaming-asr/README.md new file mode 100644 index 000000000..220a55d62 --- /dev/null +++ b/pascal-api-examples/vad-with-non-streaming-asr/README.md @@ -0,0 +1,12 @@ +# Introduction + + +This directory contains examples of how to use the VAD (voice activity detection) +with non-streaming speech recognition models. + +|Script| Description| +|---------|------------| +|[run-vad-with-whisper.sh](./run-vad-with-whisper.sh)|It shows how to use the VAD + Whisper for speech recognition.| +|[run-vad-with-sense-voice.sh](./run-vad-with-sense-voice.sh)|It shows how to use the VAD + SenseVoice for speech recognition.| + +Please refer to [non-streaming-asr](../non-streaming-asr) for more kinds of non-streaming models. diff --git a/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-sense-voice.sh b/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-sense-voice.sh new file mode 100755 index 000000000..b339b6cd0 --- /dev/null +++ b/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-sense-voice.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + popd +fi + +if [[ ! 
-f ./silero_vad.onnx ]]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +fi + +if [ ! -f ./lei-jun-test.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav +fi + +if [ ! -f ./sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + tar xvf sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 + rm sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17.tar.bz2 +fi + +fpc \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + ./vad_with_sense_voice.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./vad_with_sense_voice diff --git a/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-whisper.sh b/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-whisper.sh new file mode 100755 index 000000000..260fdf36a --- /dev/null +++ b/pascal-api-examples/vad-with-non-streaming-asr/run-vad-with-whisper.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + popd +fi + +if [[ ! 
-f ./silero_vad.onnx ]]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +fi + +if [ ! -f ./Obama.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/Obama.wav +fi + +if [ ! -f ./sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 + + tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2 + rm sherpa-onnx-whisper-tiny.en.tar.bz2 +fi + +fpc \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + ./vad_with_whisper.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./vad_with_whisper diff --git a/pascal-api-examples/vad-with-non-streaming-asr/vad_with_sense_voice.pas b/pascal-api-examples/vad-with-non-streaming-asr/vad_with_sense_voice.pas new file mode 100644 index 000000000..fff484db3 --- /dev/null +++ b/pascal-api-examples/vad-with-non-streaming-asr/vad_with_sense_voice.pas @@ -0,0 +1,137 @@ +{ Copyright (c) 2024 Xiaomi Corporation } + +{ +This file shows how to use a non-streaming SenseVoice model +with silero VAD to decode files. 
+ +You can download the model files from +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +} + +program vad_with_sense_voice; + +{$mode objfpc} + +uses + sherpa_onnx, + SysUtils; + +function CreateVad(): TSherpaOnnxVoiceActivityDetector; {Creates a silero VAD detector configured for 16 kHz audio and a 512-sample window} +var + Config: TSherpaOnnxVadModelConfig; + + SampleRate: Integer; + WindowSize: Integer; +begin + Initialize(Config); + + SampleRate := 16000; {Please don't change it unless you know the details} + WindowSize := 512; {Please don't change it unless you know the details} + + Config.SileroVad.Model := './silero_vad.onnx'; + Config.SileroVad.MinSpeechDuration := 0.5; + Config.SileroVad.MinSilenceDuration := 0.5; + Config.SileroVad.Threshold := 0.5; + Config.SileroVad.WindowSize := WindowSize; + Config.NumThreads:= 1; + Config.Debug:= True; + Config.Provider:= 'cpu'; + Config.SampleRate := SampleRate; + + Result := TSherpaOnnxVoiceActivityDetector.Create(Config, 30); {30 is BufferSizeInSeconds} +end; + +function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer; {Creates a non-streaming SenseVoice recognizer (int8 model, cpu provider, 1 thread)} +var + Config: TSherpaOnnxOfflineRecognizerConfig; +begin + Initialize(Config); + + Config.ModelConfig.SenseVoice.Model := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/model.int8.onnx'; + Config.ModelConfig.SenseVoice.Language := 'auto'; + Config.ModelConfig.SenseVoice.UseItn := False; + Config.ModelConfig.Tokens := './sherpa-onnx-sense-voice-zh-en-ja-ko-yue-2024-07-17/tokens.txt'; + Config.ModelConfig.Provider := 'cpu'; + Config.ModelConfig.NumThreads := 1; + Config.ModelConfig.Debug := False; + + Result := TSherpaOnnxOfflineRecognizer.Create(Config); +end; + +var + Wave: TSherpaOnnxWave; + + Recognizer: TSherpaOnnxOfflineRecognizer; + Vad: TSherpaOnnxVoiceActivityDetector; + + Offset: Integer; + WindowSize: Integer; + SpeechSegment: TSherpaOnnxSpeechSegment; + + Start: Single; + Duration: Single; + + Stream: TSherpaOnnxOfflineStream; + RecognitionResult: TSherpaOnnxOfflineRecognizerResult; +begin + Vad := CreateVad(); + Recognizer := CreateOfflineRecognizer(); + + Wave := 
SherpaOnnxReadWave('./lei-jun-test.wav'); + if Wave.SampleRate <> Vad.Config.SampleRate then + begin + WriteLn(Format('Expected sample rate: %d. Given: %d', + [Vad.Config.SampleRate, Wave.SampleRate])); + + Exit; + end; + + WindowSize := Vad.Config.SileroVad.WindowSize; + Offset := 0; + while Offset + WindowSize <= Length(Wave.Samples) do + begin + Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize); + Offset += WindowSize; + + while not Vad.IsEmpty do + begin + SpeechSegment := Vad.Front(); + Vad.Pop(); + Stream := Recognizer.CreateStream(); + + Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate); + Recognizer.Decode(Stream); + RecognitionResult := Recognizer.GetResult(Stream); + + Start := SpeechSegment.Start / Wave.SampleRate; + Duration := Length(SpeechSegment.Samples) / Wave.SampleRate; + WriteLn(Format('%.3f -- %.3f %s', + [Start, Start + Duration, RecognitionResult.Text])); + + FreeAndNil(Stream); + end; + end; + + Vad.Flush; + + while not Vad.IsEmpty do + begin + SpeechSegment := Vad.Front(); + Vad.Pop(); + Stream := Recognizer.CreateStream(); + + Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate); + Recognizer.Decode(Stream); + RecognitionResult := Recognizer.GetResult(Stream); + + Start := SpeechSegment.Start / Wave.SampleRate; + Duration := Length(SpeechSegment.Samples) / Wave.SampleRate; + WriteLn(Format('%.3f -- %.3f %s', + [Start, Start + Duration, RecognitionResult.Text])); + + FreeAndNil(Stream); + end; + + FreeAndNil(Recognizer); + FreeAndNil(Vad); +end. diff --git a/pascal-api-examples/vad-with-non-streaming-asr/vad_with_whisper.pas b/pascal-api-examples/vad-with-non-streaming-asr/vad_with_whisper.pas new file mode 100644 index 000000000..51df1815e --- /dev/null +++ b/pascal-api-examples/vad-with-non-streaming-asr/vad_with_whisper.pas @@ -0,0 +1,136 @@ +{ Copyright (c) 2024 Xiaomi Corporation } + +{ +This file shows how to use a non-streaming Whisper model +with silero VAD to decode files. 
+ +You can download the model files from +https://github.com/k2-fsa/sherpa-onnx/releases/tag/asr-models +} + +program vad_with_whisper; + +{$mode objfpc} + +uses + sherpa_onnx, + SysUtils; + +function CreateVad(): TSherpaOnnxVoiceActivityDetector; +var + Config: TSherpaOnnxVadModelConfig; + + SampleRate: Integer; + WindowSize: Integer; +begin + Initialize(Config); + + SampleRate := 16000; {Please don't change it unless you know the details} + WindowSize := 512; {Please don't change it unless you know the details} + + Config.SileroVad.Model := './silero_vad.onnx'; + Config.SileroVad.MinSpeechDuration := 0.5; + Config.SileroVad.MinSilenceDuration := 0.5; + Config.SileroVad.Threshold := 0.5; + Config.SileroVad.WindowSize := WindowSize; + Config.NumThreads:= 1; + Config.Debug:= True; + Config.Provider:= 'cpu'; + Config.SampleRate := SampleRate; + + Result := TSherpaOnnxVoiceActivityDetector.Create(Config, 30); +end; + +function CreateOfflineRecognizer(): TSherpaOnnxOfflineRecognizer; +var + Config: TSherpaOnnxOfflineRecognizerConfig; +begin + Initialize(Config); + + Config.ModelConfig.Whisper.Encoder := './sherpa-onnx-whisper-tiny.en/tiny.en-encoder.int8.onnx'; + Config.ModelConfig.Whisper.Decoder := './sherpa-onnx-whisper-tiny.en/tiny.en-decoder.int8.onnx'; + Config.ModelConfig.Tokens := './sherpa-onnx-whisper-tiny.en/tiny.en-tokens.txt'; + Config.ModelConfig.Provider := 'cpu'; + Config.ModelConfig.NumThreads := 1; + Config.ModelConfig.Debug := False; + + Result := TSherpaOnnxOfflineRecognizer.Create(Config); +end; + +var + Wave: TSherpaOnnxWave; + + Recognizer: TSherpaOnnxOfflineRecognizer; + Vad: TSherpaOnnxVoiceActivityDetector; + + Offset: Integer; + WindowSize: Integer; + SpeechSegment: TSherpaOnnxSpeechSegment; + + Start: Single; + Duration: Single; + + Stream: TSherpaOnnxOfflineStream; + RecognitionResult: TSherpaOnnxOfflineRecognizerResult; +begin + Vad := CreateVad(); + Recognizer := CreateOfflineRecognizer(); + + Wave := SherpaOnnxReadWave('./Obama.wav'); + 
if Wave.SampleRate <> Vad.Config.SampleRate then + begin + WriteLn(Format('Expected sample rate: %d. Given: %d', + [Vad.Config.SampleRate, Wave.SampleRate])); + + Exit; + end; + + WindowSize := Vad.Config.SileroVad.WindowSize; + Offset := 0; + while Offset + WindowSize <= Length(Wave.Samples) do + begin + Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize); + Offset += WindowSize; + + while not Vad.IsEmpty do + begin + SpeechSegment := Vad.Front(); + Vad.Pop(); + Stream := Recognizer.CreateStream(); + + Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate); + Recognizer.Decode(Stream); + RecognitionResult := Recognizer.GetResult(Stream); + + Start := SpeechSegment.Start / Wave.SampleRate; + Duration := Length(SpeechSegment.Samples) / Wave.SampleRate; + WriteLn(Format('%.3f -- %.3f %s', + [Start, Start + Duration, RecognitionResult.Text])); + + FreeAndNil(Stream); + end; + end; + + Vad.Flush; + + while not Vad.IsEmpty do + begin + SpeechSegment := Vad.Front(); + Vad.Pop(); + Stream := Recognizer.CreateStream(); + + Stream.AcceptWaveform(SpeechSegment.Samples, Wave.SampleRate); + Recognizer.Decode(Stream); + RecognitionResult := Recognizer.GetResult(Stream); + + Start := SpeechSegment.Start / Wave.SampleRate; + Duration := Length(SpeechSegment.Samples) / Wave.SampleRate; + WriteLn(Format('%.3f -- %.3f %s', + [Start, Start + Duration, RecognitionResult.Text])); + + FreeAndNil(Stream); + end; + + FreeAndNil(Recognizer); + FreeAndNil(Vad); +end. 
diff --git a/pascal-api-examples/vad/.gitignore b/pascal-api-examples/vad/.gitignore new file mode 100644 index 000000000..d7abe78d3 --- /dev/null +++ b/pascal-api-examples/vad/.gitignore @@ -0,0 +1,3 @@ +!run*.sh +circular_buffer +remove_silence diff --git a/pascal-api-examples/vad/README.md b/pascal-api-examples/vad/README.md new file mode 100644 index 000000000..f5d78243d --- /dev/null +++ b/pascal-api-examples/vad/README.md @@ -0,0 +1,11 @@ +# Introduction + + +This directory contains examples of how to use the VAD (voice activity detection) +APIs. + +|Script| Description| +|---------|------------| +|[run-circular-buffer.sh](./run-circular-buffer.sh)|It shows how to use the circular buffer API.| +|[run-remove-silence.sh](./run-remove-silence.sh)|It shows how to use the VAD API to remove silences from a wave file.| + diff --git a/pascal-api-examples/vad/circular_buffer.pas b/pascal-api-examples/vad/circular_buffer.pas new file mode 100644 index 000000000..b7b6ecf26 --- /dev/null +++ b/pascal-api-examples/vad/circular_buffer.pas @@ -0,0 +1,106 @@ +{ Copyright (c) 2024 Xiaomi Corporation } +program circular_buffer; +{ +This file shows how to use the CircularBuffer API of sherpa-onnx +} + +{$mode objfpc} +{$ASSERTIONS ON} + +uses + sherpa_onnx; + +var + Buffer: TSherpaOnnxCircularBuffer; + Samples: TSherpaOnnxSamplesArray; +begin + {The initial capacity is 5. It will be resized automatically if needed.} + Buffer := TSherpaOnnxCircularBuffer.Create(5); + Assert(Buffer.Size = 0); + Assert(Buffer.Head = 0); + Buffer.Push([0, 10, 20]); + + {Push() changes Size. 
Head is not changed.} + Assert(Buffer.Size = 3); + Assert(Buffer.Head = 0); + + Samples := Buffer.Get(0, 1); + Assert(Length(Samples) = 1); + Assert(Samples[0] = 0); + + { Get() does not change Size or Head} + Assert(Buffer.Size = 3); + Assert(Buffer.Head = 0); + + Samples := Buffer.Get(0, 2); + Assert(Length(Samples) = 2); + Assert(Samples[0] = 0); + Assert(Samples[1] = 10); + + { The buffer will be resized since its initial capacity is 5 but we have + pushed 7 elements into it. + + No data is lost during the resize. + } + Buffer.Push([30, 40, 50, 60]); + + Assert(Buffer.Size = 7); {There are now 7 elements} + Assert(Buffer.Head = 0); + + {Remove the first 4 elements} + Buffer.Pop(4); + + Assert(Buffer.Size = 3); {There are only 3 elements left} + Assert(Buffer.Head = 4); + + Samples := Buffer.Get(Buffer.Head, 2); + Assert(Length(Samples) = 2); + Assert(Samples[0] = 40); + Assert(Samples[1] = 50); + + Buffer.Pop(1); + + Assert(Buffer.Size = 2); {There are only 2 elements left} + Assert(Buffer.Head = 5); + + Samples := Buffer.Get(Buffer.Head, 2); + Assert(Length(Samples) = 2); + Assert(Samples[0] = 50); + Assert(Samples[1] = 60); + + Buffer.Pop(2); + Assert(Buffer.Size = 0); {There are no elements left} + Assert(Buffer.Head = 7); + + Buffer.Push([100, 200, 300, 400, 500]); + Assert(Buffer.Size = 5); + Assert(Buffer.Head = 7); + + Buffer.Pop(4); + Assert(Buffer.Size = 1); + + {Head can be larger than the Capacity! + This is what circular means. The underlying slot is Buffer.Head mod Capacity. + } + Assert(Buffer.Head = 11); + Buffer.Push([600, 700]); + + Assert(Buffer.Size = 3); + Assert(Buffer.Head = 11); + + Samples := Buffer.Get(Buffer.Head, 3); + Assert(Length(Samples) = 3); + Assert(Samples[0] = 500); + Assert(Samples[1] = 600); + Assert(Samples[2] = 700); + + Buffer.Pop(3); + Assert(Buffer.Size = 0); + Assert(Buffer.Head = 14); + + Buffer.Reset(); + + Assert(Buffer.Size = 0); + Assert(Buffer.Head = 0); +end. 
+ diff --git a/pascal-api-examples/vad/remove_silence.pas b/pascal-api-examples/vad/remove_silence.pas new file mode 100644 index 000000000..141d5d225 --- /dev/null +++ b/pascal-api-examples/vad/remove_silence.pas @@ -0,0 +1,115 @@ +{ Copyright (c) 2024 Xiaomi Corporation } +{ +This file shows how to use the VAD API from sherpa-onnx +to remove silences from a wave file. +} +program main; + +{$mode delphi} + +uses + sherpa_onnx, + SysUtils; + +var + Wave: TSherpaOnnxWave; + + Config: TSherpaOnnxVadModelConfig; + Vad: TSherpaOnnxVoiceActivityDetector; + Offset: Integer; + WindowSize: Integer; + SpeechSegment: TSherpaOnnxSpeechSegment; + + Start: Single; + Duration: Single; + SampleRate: Integer; + + AllSpeechSegment: array of TSherpaOnnxSpeechSegment; + AllSamples: array of Single; + N: Integer; + I: Integer; +begin + SampleRate := 16000; {Please don't change it unless you know the details} + + Wave := SherpaOnnxReadWave('./lei-jun-test.wav'); + if Wave.SampleRate <> SampleRate then + begin + WriteLn(Format('Expected sample rate: %d. 
Given: %d', + [SampleRate, Wave.SampleRate])); + + Exit; + end; + + WindowSize := 512; {Please don't change it unless you know the details} + Initialize(Config); + + Config.SileroVad.Model := './silero_vad.onnx'; + Config.SileroVad.MinSpeechDuration := 0.25; + Config.SileroVad.MinSilenceDuration := 0.5; + Config.SileroVad.Threshold := 0.5; + Config.SileroVad.WindowSize := WindowSize; + Config.NumThreads:= 1; + Config.Debug:= True; + Config.Provider:= 'cpu'; + Config.SampleRate := SampleRate; + + Vad := TSherpaOnnxVoiceActivityDetector.Create(Config, 20); + + AllSpeechSegment := nil; + AllSamples := nil; + Offset := 0; + while Offset + WindowSize <= Length(Wave.Samples) do + begin + Vad.AcceptWaveform(Wave.Samples, Offset, WindowSize); + Inc(Offset, WindowSize); + + while not Vad.IsEmpty do + begin + SetLength(AllSpeechSegment, Length(AllSpeechSegment) + 1); + + SpeechSegment := Vad.Front(); + Vad.Pop(); + AllSpeechSegment[Length(AllSpeechSegment)-1] := SpeechSegment; + + Start := SpeechSegment.Start / SampleRate; + Duration := Length(SpeechSegment.Samples) / SampleRate; + WriteLn(Format('%.3f -- %.3f', [Start, Start + Duration])); + end; + end; + + Vad.Flush; + + while not Vad.IsEmpty do + begin + SetLength(AllSpeechSegment, Length(AllSpeechSegment) + 1); + + SpeechSegment := Vad.Front(); + Vad.Pop(); + AllSpeechSegment[Length(AllSpeechSegment)-1] := SpeechSegment; + + Start := SpeechSegment.Start / SampleRate; + Duration := Length(SpeechSegment.Samples) / SampleRate; + WriteLn(Format('%.3f -- %.3f', [Start, Start + Duration])); + end; + + N := 0; + for SpeechSegment in AllSpeechSegment do + Inc(N, Length(SpeechSegment.Samples)); + + SetLength(AllSamples, N); + + N := 0; + for SpeechSegment in AllSpeechSegment do + begin + for I := Low(SpeechSegment.Samples) to High(SpeechSegment.Samples) do + begin + AllSamples[N] := SpeechSegment.Samples[I]; + Inc(N); + end; + end; + + SherpaOnnxWriteWave('./lei-jun-test-no-silence.wav', AllSamples, SampleRate); + WriteLn('Saved 
to ./lei-jun-test-no-silence.wav'); + + FreeAndNil(Vad); +end. diff --git a/pascal-api-examples/vad/run-circular-buffer.sh b/pascal-api-examples/vad/run-circular-buffer.sh new file mode 100755 index 000000000..b46524b80 --- /dev/null +++ b/pascal-api-examples/vad/run-circular-buffer.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + popd +fi + +fpc \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + ./circular_buffer.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./circular_buffer diff --git a/pascal-api-examples/vad/run-remove-silence.sh b/pascal-api-examples/vad/run-remove-silence.sh new file mode 100755 index 000000000..d8157cb6d --- /dev/null +++ b/pascal-api-examples/vad/run-remove-silence.sh @@ -0,0 +1,42 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! 
-f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + popd +fi + +if [[ ! -f ./silero_vad.onnx ]]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx +fi + +if [ ! -f ./lei-jun-test.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/lei-jun-test.wav +fi + +fpc \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + ./remove_silence.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./remove_silence diff --git a/sherpa-onnx/csrc/circular-buffer.cc b/sherpa-onnx/csrc/circular-buffer.cc index ef937cabe..2fd19cdfa 100644 --- a/sherpa-onnx/csrc/circular-buffer.cc +++ b/sherpa-onnx/csrc/circular-buffer.cc @@ -95,6 +95,8 @@ void CircularBuffer::Push(const float *p, int32_t n) { "capacity to: %d", n, size, n + size, capacity, new_capacity); Resize(new_capacity); + + capacity = new_capacity; } int32_t start = tail_ % capacity; diff --git a/sherpa-onnx/pascal-api/sherpa_onnx.pas b/sherpa-onnx/pascal-api/sherpa_onnx.pas index 8e1f03b72..6b6ccec6c 100644 --- a/sherpa-onnx/pascal-api/sherpa_onnx.pas +++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas @@ -2,9 +2,11 @@ unit sherpa_onnx; -{$mode objfpc} +{$IFDEF FPC} + {$mode objfpc} + {$modeSwitch advancedRecords} { to support records with methods } +{$ENDIF} -{$modeSwitch advancedRecords} { to support records with methods } (* {$LongStrings ON} *) interface @@ -45,18 +47,21 @@ TSherpaOnnxOnlineModelConfig = record ModelingUnit: AnsiString; BpeVocab: AnsiString; function 
ToString: AnsiString; + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineModelConfig); end; TSherpaOnnxFeatureConfig = record SampleRate: Integer; FeatureDim: Integer; function ToString: AnsiString; + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFeatureConfig); end; TSherpaOnnxOnlineCtcFstDecoderConfig = record Graph: AnsiString; MaxActive: Integer; function ToString: AnsiString; + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineCtcFstDecoderConfig); end; TSherpaOnnxOnlineRecognizerConfig = record @@ -75,6 +80,7 @@ TSherpaOnnxOnlineRecognizerConfig = record RuleFars: AnsiString; BlankPenalty: Single; function ToString: AnsiString; + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineRecognizerConfig); end; TSherpaOnnxOnlineRecognizerResult = record @@ -97,6 +103,7 @@ TSherpaOnnxOnlineStream = class TSherpaOnnxOnlineRecognizer = class private Handle: Pointer; + _Config: TSherpaOnnxOnlineRecognizerConfig; public constructor Create(Config: TSherpaOnnxOnlineRecognizerConfig); destructor Destroy; override; @@ -108,6 +115,7 @@ TSherpaOnnxOnlineRecognizer = class procedure Reset(Stream: TSherpaOnnxOnlineStream); function IsEndpoint(Stream: TSherpaOnnxOnlineStream): Boolean; function GetResult(Stream: TSherpaOnnxOnlineStream): TSherpaOnnxOnlineRecognizerResult; + property Config: TSherpaOnnxOnlineRecognizerConfig Read _Config; end; TSherpaOnnxOfflineTransducerModelConfig = record @@ -134,6 +142,7 @@ TSherpaOnnxOfflineWhisperModelConfig = record Task: AnsiString; TailPaddings: Integer; function ToString: AnsiString; + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineWhisperModelConfig); end; TSherpaOnnxOfflineTdnnModelConfig = record @@ -145,12 +154,14 @@ TSherpaOnnxOfflineLMConfig = record Model: AnsiString; Scale: Single; function ToString: AnsiString; + class operator Initialize({$IFDEF 
FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineLMConfig); end; TSherpaOnnxOfflineSenseVoiceModelConfig = record Model: AnsiString; Language: AnsiString; UseItn: Boolean; + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSenseVoiceModelConfig); function ToString: AnsiString; end; @@ -169,6 +180,7 @@ TSherpaOnnxOfflineModelConfig = record BpeVocab: AnsiString; TeleSpeechCtc: AnsiString; SenseVoice: TSherpaOnnxOfflineSenseVoiceModelConfig; + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig); function ToString: AnsiString; end; @@ -183,6 +195,7 @@ TSherpaOnnxOfflineRecognizerConfig = record RuleFsts: AnsiString; RuleFars: AnsiString; BlankPenalty: Single; + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineRecognizerConfig); function ToString: AnsiString; end; @@ -205,18 +218,83 @@ TSherpaOnnxOfflineStream = class TSherpaOnnxOfflineRecognizer = class private Handle: Pointer; + _Config: TSherpaOnnxOfflineRecognizerConfig; public constructor Create(Config: TSherpaOnnxOfflineRecognizerConfig); destructor Destroy; override; function CreateStream: TSherpaOnnxOfflineStream; procedure Decode(Stream: TSherpaOnnxOfflineStream); function GetResult(Stream: TSherpaOnnxOfflineStream): TSherpaOnnxOfflineRecognizerResult; + property Config: TSherpaOnnxOfflineRecognizerConfig Read _Config; end; -{ It supports reading a single channel wave with 16-bit encoded samples. - Samples are normalized to the range [-1, 1]. 
-} -function SherpaOnnxReadWave(Filename: AnsiString): TSherpaOnnxWave; + TSherpaOnnxSileroVadModelConfig = record + Model: AnsiString; + Threshold: Single; + MinSilenceDuration: Single; + MinSpeechDuration: Single; + WindowSize: Integer; + function ToString: AnsiString; + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig); + end; + + TSherpaOnnxVadModelConfig = record + SileroVad: TSherpaOnnxSileroVadModelConfig; + SampleRate: Integer; + NumThreads: Integer; + Provider: AnsiString; + Debug: Boolean; + function ToString: AnsiString; + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig); + end; + + TSherpaOnnxSamplesArray = array of Single; + + TSherpaOnnxCircularBuffer = class + private + Handle: Pointer; + public + constructor Create(Capacity: Integer); + destructor Destroy; override; + procedure Push(Samples: array of Single); + function Get(StartIndex: Integer; N: Integer): TSherpaOnnxSamplesArray; + procedure Pop(N: Integer); + procedure Reset; + function Size: Integer; + function Head: Integer; + end; + + TSherpaOnnxSpeechSegment = record + Samples: array of Single; + Start: Integer; + end; + + TSherpaOnnxVoiceActivityDetector = class + private + Handle: Pointer; + _Config: TSherpaOnnxVadModelConfig; + public + constructor Create(Config: TSherpaOnnxVadModelConfig; BufferSizeInSeconds: Single); + destructor Destroy; override; + procedure AcceptWaveform(Samples: array of Single); overload; + procedure AcceptWaveform(Samples: array of Single; Offset: Integer; N: Integer); overload; + function IsEmpty: Boolean; + function IsDetected: Boolean; + procedure Pop; + procedure Clear; + function Front: TSherpaOnnxSpeechSegment; + procedure Reset; + procedure Flush; + property Config: TSherpaOnnxVadModelConfig Read _Config; + end; + + { It supports reading a single channel wave with 16-bit encoded samples. + Samples are normalized to the range [-1, 1]. 
+ } + function SherpaOnnxReadWave(Filename: AnsiString): TSherpaOnnxWave; + + function SherpaOnnxWriteWave(Filename: AnsiString; + Samples: array of Single; SampleRate: Integer): Boolean; implementation @@ -294,15 +372,15 @@ SherpaOnnxOnlineRecognizerConfig = record DecodingMethod: PAnsiChar; MaxActivePaths: cint32; EnableEndpoint: cint32; - Rule1MinTrailingSilence: Single; - Rule2MinTrailingSilence: Single; - Rule3MinUtteranceLength: Single; + Rule1MinTrailingSilence: cfloat; + Rule2MinTrailingSilence: cfloat; + Rule3MinUtteranceLength: cfloat; HotwordsFile: PAnsiChar; - HotwordsScore: Single; + HotwordsScore: cfloat; CtcFstDecoderConfig: SherpaOnnxOnlineCtcFstDecoderConfig; RuleFsts: PAnsiChar; RuleFars: PAnsiChar; - BlankPenalty: Single; + BlankPenalty: cfloat; end; PSherpaOnnxOnlineRecognizerConfig = ^SherpaOnnxOnlineRecognizerConfig; @@ -330,7 +408,7 @@ SherpaOnnxOfflineTdnnModelConfig = record end; SherpaOnnxOfflineLMConfig = record Model: PAnsiChar; - Scale: Single; + Scale: cfloat; end; SherpaOnnxOfflineSenseVoiceModelConfig = record Model: PAnsiChar; @@ -361,14 +439,100 @@ SherpaOnnxOfflineRecognizerConfig = record DecodingMethod: PAnsiChar; MaxActivePaths: cint32; HotwordsFile: PAnsiChar; - HotwordsScore: Single; + HotwordsScore: cfloat; RuleFsts: PAnsiChar; RuleFars: PAnsiChar; - BlankPenalty: Single; + BlankPenalty: cfloat; end; PSherpaOnnxOfflineRecognizerConfig = ^SherpaOnnxOfflineRecognizerConfig; + SherpaOnnxSileroVadModelConfig = record + Model: PAnsiChar; + Threshold: cfloat; + MinSilenceDuration: cfloat; + MinSpeechDuration: cfloat; + WindowSize: cint32; + end; + SherpaOnnxVadModelConfig = record + SileroVad: SherpaOnnxSileroVadModelConfig; + SampleRate: cint32; + NumThreads: cint32; + Provider: PAnsiChar; + Debug: cint32; + end; + PSherpaOnnxVadModelConfig = ^SherpaOnnxVadModelConfig; + + SherpaOnnxSpeechSegment = record + Start: cint32; + Samples: pcfloat; + N: cint32; + end; + + PSherpaOnnxSpeechSegment = ^SherpaOnnxSpeechSegment; + +function 
SherpaOnnxCreateVoiceActivityDetector(Config: PSherpaOnnxVadModelConfig;
+  BufferSizeInSeconds: cfloat): Pointer; cdecl;
+  external SherpaOnnxLibName;
+
+{ Every Vad parameter below receives the handle that
+  TSherpaOnnxVoiceActivityDetector.Create stores in Self.Handle. }
+procedure SherpaOnnxDestroyVoiceActivityDetector(Vad: Pointer); cdecl;
+  external SherpaOnnxLibName;
+
+procedure SherpaOnnxVoiceActivityDetectorAcceptWaveform(Vad: Pointer;
+  Samples: pcfloat; N: cint32); cdecl;
+  external SherpaOnnxLibName;
+
+{ The wrappers treat a return value of 1 as True. }
+function SherpaOnnxVoiceActivityDetectorEmpty(Vad: Pointer): cint32; cdecl;
+  external SherpaOnnxLibName;
+
+function SherpaOnnxVoiceActivityDetectorDetected(Vad: Pointer): cint32; cdecl;
+  external SherpaOnnxLibName;
+
+procedure SherpaOnnxVoiceActivityDetectorPop(Vad: Pointer); cdecl;
+  external SherpaOnnxLibName;
+
+procedure SherpaOnnxVoiceActivityDetectorClear(Vad: Pointer); cdecl;
+  external SherpaOnnxLibName;
+
+{ The returned segment is released with SherpaOnnxDestroySpeechSegment. }
+function SherpaOnnxVoiceActivityDetectorFront(Vad: Pointer): PSherpaOnnxSpeechSegment; cdecl;
+  external SherpaOnnxLibName;
+
+procedure SherpaOnnxDestroySpeechSegment(P: PSherpaOnnxSpeechSegment); cdecl;
+  external SherpaOnnxLibName;
+
+{ Reset and Flush operate on the detector handle (the wrappers pass
+  Self.Handle), not on a speech segment, so the parameter is an opaque
+  Pointer like the other detector functions. }
+procedure SherpaOnnxVoiceActivityDetectorReset(Vad: Pointer); cdecl;
+  external SherpaOnnxLibName;
+
+procedure SherpaOnnxVoiceActivityDetectorFlush(Vad: Pointer); cdecl;
+  external SherpaOnnxLibName;
+
+{ Circular buffer bindings. Buffer is the handle returned by
+  SherpaOnnxCreateCircularBuffer. }
+function SherpaOnnxCreateCircularBuffer(Capacity: cint32): Pointer; cdecl;
+  external SherpaOnnxLibName;
+
+procedure SherpaOnnxDestroyCircularBuffer(Buffer: Pointer) ; cdecl;
+  external SherpaOnnxLibName;
+
+procedure SherpaOnnxCircularBufferPush(Buffer: Pointer; Samples: pcfloat; N: cint32); cdecl;
+  external SherpaOnnxLibName;
+
+{ The returned pointer is released with SherpaOnnxCircularBufferFree. }
+function SherpaOnnxCircularBufferGet(Buffer: Pointer; StartIndex: cint32; N: cint32): pcfloat ; cdecl;
+  external SherpaOnnxLibName;
+
+procedure SherpaOnnxCircularBufferFree(P: pcfloat); cdecl;
+  external SherpaOnnxLibName;
+
+procedure SherpaOnnxCircularBufferPop(Buffer: Pointer; N: cint32); cdecl;
+  external SherpaOnnxLibName;
+
+function SherpaOnnxCircularBufferSize(Buffer: Pointer): cint32; cdecl;
+  external SherpaOnnxLibName;
+
+function SherpaOnnxCircularBufferHead(Buffer: Pointer): cint32; cdecl;
+  external SherpaOnnxLibName;
+
+procedure SherpaOnnxCircularBufferReset(Buffer: Pointer); cdecl;
+  external SherpaOnnxLibName;
+
 function SherpaOnnxCreateOnlineRecognizer(Config: PSherpaOnnxOnlineRecognizerConfig): Pointer; cdecl;
   external SherpaOnnxLibName;
 
@@ -437,9 +601,20 @@ procedure SherpaOnnxDestroyOfflineStreamResultJson(Json: PAnsiChar); cdecl;
 function SherpaOnnxReadWaveWrapper(Filename: PAnsiChar): PSherpaOnnxWave; cdecl;
   external SherpaOnnxLibName name 'SherpaOnnxReadWave';
 
+function SherpaOnnxWriteWaveWrapper(Samples: pcfloat; N: cint32;
+  SampleRate: cint32; Filename: PAnsiChar): cint32; cdecl;
+  external SherpaOnnxLibName name 'SherpaOnnxWriteWave';
+
 procedure SherpaOnnxFreeWaveWrapper(P: PSherpaOnnxWave); cdecl;
   external SherpaOnnxLibName name 'SherpaOnnxFreeWave';
 
+{ Writes Samples to Filename through the native SherpaOnnxWriteWave.
+  Returns True only when the native call returns 1. }
+function SherpaOnnxWriteWave(Filename: AnsiString;
+  Samples: array of Single; SampleRate: Integer): Boolean;
+begin
+  Result := SherpaOnnxWriteWaveWrapper(pcfloat(Samples), Length(Samples),
+    SampleRate, PAnsiChar(Filename)) = 1;
+end;
+
 function SherpaOnnxReadWave(Filename: AnsiString): TSherpaOnnxWave;
 var
   PFilename: PAnsiChar;
@@ -611,6 +786,7 @@ constructor TSherpaOnnxOnlineRecognizer.Create(Config: TSherpaOnnxOnlineRecogniz
   C.BlankPenalty := Config.BlankPenalty;
 
   Self.Handle := SherpaOnnxCreateOnlineRecognizer(@C);
+  Self._Config := Config;
 end;
 
 destructor TSherpaOnnxOnlineRecognizer.Destroy;
@@ -877,6 +1053,7 @@ constructor TSherpaOnnxOfflineRecognizer.Create(Config: TSherpaOnnxOfflineRecogn
   C.BlankPenalty := Config.BlankPenalty;
 
   Self.Handle := SherpaOnnxCreateOfflineRecognizer(@C);
+  Self._Config := Config;
 end;
 
 destructor TSherpaOnnxOfflineRecognizer.Destroy;
@@ -984,5 +1161,255 @@ function TSherpaOnnxOfflineRecognizerResult.ToString: AnsiString;
     [Self.Text, TokensStr, TimestampStr]);
 end;
 
+{ Human-readable dump of the Silero VAD settings. }
+function TSherpaOnnxSileroVadModelConfig.ToString: AnsiString;
+begin
+  Result := Format('TSherpaOnnxSileroVadModelConfig(' +
+    'Model := %s, ' +
+    'Threshold := %.2f, ' +
+    'MinSilenceDuration := %.2f, ' +
+    'MinSpeechDuration := %.2f, ' +
+    'WindowSize := %d' +
+    ')',
+    [Self.Model, Self.Threshold, Self.MinSilenceDuration,
+     Self.MinSpeechDuration, Self.WindowSize
+    ]);
+end;
+
+{ Default Silero VAD settings; Model is left for the caller to fill in. }
+class operator TSherpaOnnxSileroVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxSileroVadModelConfig);
+begin
+  Dest.Threshold := 0.5;
+  Dest.MinSilenceDuration := 0.5;
+  Dest.MinSpeechDuration := 0.25;
+  Dest.WindowSize := 512;
+end;
+
+{ Human-readable dump of the VAD settings, including the nested SileroVad. }
+function TSherpaOnnxVadModelConfig.ToString: AnsiString;
+begin
+  Result := Format('TSherpaOnnxVadModelConfig(' +
+    'SileroVad := %s, ' +
+    'SampleRate := %d, ' +
+    'NumThreads := %d, ' +
+    'Provider := %s, ' +
+    'Debug := %s' +
+    ')',
+    [Self.SileroVad.ToString, Self.SampleRate, Self.NumThreads, Self.Provider,
+     Self.Debug.ToString
+    ]);
+end;
+
+class operator TSherpaOnnxVadModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig);
+begin
+  Dest.SampleRate := 16000;
+  Dest.NumThreads := 1;
+  Dest.Provider := 'cpu';
+  Dest.Debug := False;
+end;
+
+class operator TSherpaOnnxFeatureConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxFeatureConfig);
+begin
+  Dest.SampleRate := 16000;
+  Dest.FeatureDim := 80;
+end;
+
+class operator TSherpaOnnxOnlineCtcFstDecoderConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineCtcFstDecoderConfig);
+begin
+  Dest.MaxActive := 3000;
+end;
+
+{ NOTE(review): unlike TSherpaOnnxOfflineRecognizerConfig.Initialize, no
+  default is assigned to MaxActivePaths here - confirm this is intended. }
+class operator TSherpaOnnxOnlineRecognizerConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineRecognizerConfig);
+begin
+  Dest.DecodingMethod := 'greedy_search';
+  Dest.EnableEndpoint := False;
+  Dest.Rule1MinTrailingSilence := 2.4;
+  Dest.Rule2MinTrailingSilence := 1.2;
+  Dest.Rule3MinUtteranceLength := 20;
+  Dest.HotwordsScore := 1.5;
+  Dest.BlankPenalty := 0;
+end;
+
+class operator TSherpaOnnxOnlineModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOnlineModelConfig);
+begin
+  Dest.NumThreads := 1;
+  Dest.Provider := 'cpu';
+  Dest.Debug := False;
+end;
+
+class operator TSherpaOnnxOfflineWhisperModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineWhisperModelConfig);
+begin
+  Dest.Task := 'transcribe';
+  Dest.TailPaddings := -1;
+end;
+
+class operator TSherpaOnnxOfflineLMConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineLMConfig);
+begin
+  Dest.Scale := 1.0;
+end;
+
+class operator TSherpaOnnxOfflineSenseVoiceModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineSenseVoiceModelConfig);
+begin
+  Dest.UseItn := True;
+end;
+
+class operator TSherpaOnnxOfflineModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineModelConfig);
+begin
+  Dest.NumThreads := 1;
+  Dest.Debug := False;
+  Dest.Provider := 'cpu';
+end;
+
+class operator TSherpaOnnxOfflineRecognizerConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineRecognizerConfig);
+begin
+  Dest.DecodingMethod := 'greedy_search';
+  Dest.MaxActivePaths := 4;
+  Dest.HotwordsScore := 1.5;
+  Dest.BlankPenalty := 0;
+end;
+
+{ Creates the native circular buffer and keeps its handle. }
+constructor TSherpaOnnxCircularBuffer.Create(Capacity: Integer);
+begin
+  Self.Handle := SherpaOnnxCreateCircularBuffer(Capacity);
+end;
+
+{ Releases the native buffer, then runs the inherited destructor. }
+destructor TSherpaOnnxCircularBuffer.Destroy;
+begin
+  SherpaOnnxDestroyCircularBuffer(Self.Handle);
+  Self.Handle := nil;
+  inherited Destroy;
+end;
+
+{ Appends all of Samples to the native buffer. }
+procedure TSherpaOnnxCircularBuffer.Push(Samples: array of Single);
+begin
+  SherpaOnnxCircularBufferPush(Self.Handle, pcfloat(Samples), Length(Samples));
+end;
+
+{ Returns a Pascal-owned copy of N samples starting at StartIndex.
+  An empty array is returned when N <= 0 or the native call yields nil. }
+function TSherpaOnnxCircularBuffer.Get(StartIndex: Integer; N: Integer): TSherpaOnnxSamplesArray;
+var
+  P: pcfloat;
+  I: Integer;
+begin
+  Result := nil;
+
+  { Nothing to copy; also keeps a negative count away from the native
+    call and from SetLength. }
+  if N <= 0 then
+    Exit;
+
+  P := SherpaOnnxCircularBufferGet(Self.Handle, StartIndex, N);
+  if P = nil then
+    Exit;
+
+  SetLength(Result, N);
+
+  for I := Low(Result) to High(Result) do
+    Result[I] := P[I];
+
+  { The native copy is no longer needed once the samples are in Result. }
+  SherpaOnnxCircularBufferFree(P);
+end;
+
+procedure TSherpaOnnxCircularBuffer.Pop(N: Integer);
+begin
+  SherpaOnnxCircularBufferPop(Self.Handle, N);
+end;
+
+procedure TSherpaOnnxCircularBuffer.Reset;
+begin
+  SherpaOnnxCircularBufferReset(Self.Handle);
+end;
+
+function TSherpaOnnxCircularBuffer.Size: Integer;
+begin
+  Result := SherpaOnnxCircularBufferSize(Self.Handle);
+end;
+
+function TSherpaOnnxCircularBuffer.Head: Integer;
+begin
+  Result := SherpaOnnxCircularBufferHead(Self.Handle);
+end;
+
+{ Keeps a copy of Config (exposed through the Config property), mirrors it
+  into the C record and creates the native detector. }
+constructor TSherpaOnnxVoiceActivityDetector.Create(Config: TSherpaOnnxVadModelConfig; BufferSizeInSeconds: Single);
+var
+  C: SherpaOnnxVadModelConfig;
+begin
+  Self._Config := Config;
+
+  Initialize(C);
+
+  C.SileroVad.Model := PAnsiChar(Config.SileroVad.Model);
+  C.SileroVad.Threshold := Config.SileroVad.Threshold;
+  C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration;
+  C.SileroVad.MinSpeechDuration := Config.SileroVad.MinSpeechDuration;
+  C.SileroVad.WindowSize := Config.SileroVad.WindowSize;
+
+  C.SampleRate := Config.SampleRate;
+  C.NumThreads := Config.NumThreads;
+  C.Provider := PAnsiChar(Config.Provider);
+  C.Debug := Ord(Config.Debug);
+
+  Self.Handle := SherpaOnnxCreateVoiceActivityDetector(@C, BufferSizeInSeconds);
+end;
+
+{ Releases the native detector, then runs the inherited destructor. }
+destructor TSherpaOnnxVoiceActivityDetector.Destroy;
+begin
+  SherpaOnnxDestroyVoiceActivityDetector(Self.Handle);
+  Self.Handle := nil;
+  inherited Destroy;
+end;
+
+{ Feeds the whole of Samples to the detector. }
+procedure TSherpaOnnxVoiceActivityDetector.AcceptWaveform(Samples: array of Single);
+begin
+  SherpaOnnxVoiceActivityDetectorAcceptWaveform(Self.Handle, pcfloat(Samples), Length(Samples));
+end;
+
+{ Feeds N samples starting at index Offset of Samples to the detector.
+  The call is rejected with a console message, and nothing is passed on,
+  when the requested window does not lie inside Samples. }
+procedure TSherpaOnnxVoiceActivityDetector.AcceptWaveform(Samples: array of Single; Offset: Integer; N: Integer);
+begin
+  { Negative values must be rejected as well: they would move the pointer
+    before the array or hand a negative count to the native code. }
+  if (Offset < 0) or (N < 0) or (Offset + N > Length(Samples)) then
+  begin
+    WriteLn(Format('Invalid arguments!. Array length: %d, Offset: %d, N: %d',
+      [Length(Samples), Offset, N]
+    ));
+    Exit;
+  end;
+
+  SherpaOnnxVoiceActivityDetectorAcceptWaveform(Self.Handle,
+    pcfloat(Samples) + Offset, N);
+end;
+
+function TSherpaOnnxVoiceActivityDetector.IsEmpty: Boolean;
+begin
+  Result := SherpaOnnxVoiceActivityDetectorEmpty(Self.Handle) = 1;
+end;
+
+function TSherpaOnnxVoiceActivityDetector.IsDetected: Boolean;
+begin
+  Result := SherpaOnnxVoiceActivityDetectorDetected(Self.Handle) = 1;
+end;
+
+procedure TSherpaOnnxVoiceActivityDetector.Pop;
+begin
+  SherpaOnnxVoiceActivityDetectorPop(Self.Handle);
+end;
+
+procedure TSherpaOnnxVoiceActivityDetector.Clear;
+begin
+  SherpaOnnxVoiceActivityDetectorClear(Self.Handle);
+end;
+
+{ Returns a Pascal-owned copy of the segment reported by the native Front
+  call. A segment with Start = 0 and no samples is returned when the
+  native call yields nil.
+  NOTE(review): callers presumably should check IsEmpty first - confirm how
+  the native Front behaves when no segment is queued. }
+function TSherpaOnnxVoiceActivityDetector.Front: TSherpaOnnxSpeechSegment;
+var
+  P: PSherpaOnnxSpeechSegment;
+  I: Integer;
+begin
+  Result.Start := 0;
+  Result.Samples := nil;
+
+  P := SherpaOnnxVoiceActivityDetectorFront(Self.Handle);
+  if P = nil then
+    Exit;
+
+  Result.Start := P^.Start;
+
+  { Guard against a non-positive count before sizing the array. }
+  if P^.N > 0 then
+    SetLength(Result.Samples, P^.N);
+
+  for I := Low(Result.Samples) to High(Result.Samples) do
+    Result.Samples[I] := P^.Samples[I];
+
+  { The native segment is no longer needed once its samples are copied. }
+  SherpaOnnxDestroySpeechSegment(P);
+end;
+
+procedure TSherpaOnnxVoiceActivityDetector.Reset;
+begin
+  SherpaOnnxVoiceActivityDetectorReset(Self.Handle);
+end;
+
+procedure TSherpaOnnxVoiceActivityDetector.Flush;
+begin
+  SherpaOnnxVoiceActivityDetectorFlush(Self.Handle);
+end;
+
 end.