diff --git a/.github/workflows/pascal.yaml b/.github/workflows/pascal.yaml index e780c39f7..2ed213184 100644 --- a/.github/workflows/pascal.yaml +++ b/.github/workflows/pascal.yaml @@ -119,13 +119,29 @@ jobs: cp -v install/lib/*.dll ../pascal-api-examples/vad cp -v install/lib/*.dll ../pascal-api-examples/vad-with-non-streaming-asr - cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/read-wav - cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/streaming-asr - cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/non-streaming-asr - cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad - cp -v ../sherpa-onnx/pascal-api/sherpa_onnx.pas ../pascal-api-examples/vad-with-non-streaming-asr + cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/read-wav + cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/streaming-asr + cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/non-streaming-asr + cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad + cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/vad-with-non-streaming-asr + cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/tts fi + - name: Run Pascal test (TTS) + shell: bash + run: | + export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH + + cd ./pascal-api-examples + pushd tts + + ./run-piper.sh + rm -rf vits-piper-* + ls -lh + echo "---" + + popd + - name: Run Pascal test (VAD + non-streaming ASR) shell: bash run: | diff --git a/pascal-api-examples/.gitignore b/pascal-api-examples/.gitignore new file mode 100644 index 000000000..30740b9de --- /dev/null +++ b/pascal-api-examples/.gitignore @@ -0,0 +1 @@ +link*.res diff --git a/pascal-api-examples/README.md b/pascal-api-examples/README.md index ffbf8e9ef..5475d825b 100644 --- a/pascal-api-examples/README.md +++ b/pascal-api-examples/README.md @@ -13,3 +13,5 @@ https://k2-fsa.github.io/sherpa/onnx/pascal-api/index.html 
|[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.| |[vad](./vad)| It shows how to use the voice activity detection API.| |[vad-with-non-streaming-asr](./vad-with-non-streaming-asr)| It shows how to use the voice activity detection API with non-streaming models for speech recognition.| +|[portaudio-test](./portaudio-test)| It shows how to use PortAudio for recording and playing.| +|[tts](./tts)| It shows how to use the text-to-speech API.| diff --git a/pascal-api-examples/tts/.gitignore b/pascal-api-examples/tts/.gitignore new file mode 100644 index 000000000..b7076ab5c --- /dev/null +++ b/pascal-api-examples/tts/.gitignore @@ -0,0 +1,4 @@ +!run-*.sh +piper +piper-playback +link*.res diff --git a/pascal-api-examples/tts/README.md b/pascal-api-examples/tts/README.md new file mode 100644 index 000000000..4bb050290 --- /dev/null +++ b/pascal-api-examples/tts/README.md @@ -0,0 +1,9 @@ +# Introduction + +This directory contains examples showing how to use the TTS (text-to-speech) APIs. + +|File| Description| +|---------|------------| +|[run-piper.sh](./run-piper.sh)|It shows how to use models from [piper](https://github.com/rhasspy/piper) for text to speech.| +|[run-piper-playback.sh](./run-piper-playback.sh)|It shows how to use models from [piper](https://github.com/rhasspy/piper) for text to speech. It plays the audio back while it is still being generated. | + diff --git a/pascal-api-examples/tts/piper-playback.pas b/pascal-api-examples/tts/piper-playback.pas new file mode 100644 index 000000000..b9cd10b71 --- /dev/null +++ b/pascal-api-examples/tts/piper-playback.pas @@ -0,0 +1,238 @@ +{ Copyright (c) 2024 Xiaomi Corporation } +program piper; +{ +This file shows how to use the text to speech API of sherpa-onnx +with Piper models. + +It generates speech from text and saves it to a wave file. + +Note that it plays the audio back while it is still being generated. 
+} + +{$mode objfpc} + +uses + {$ifdef unix} + cthreads, + {$endif} + SysUtils, + dos, + ctypes, + portaudio, + sherpa_onnx; + +var + CriticalSection: TRTLCriticalSection; + + Tts: TSherpaOnnxOfflineTts; + Audio: TSherpaOnnxGeneratedAudio; + Resampler: TSherpaOnnxLinearResampler; + + Text: AnsiString; + Speed: Single = 1.0; {Use a larger value to speak faster} + SpeakerId: Integer = 0; + Buffer: TSherpaOnnxCircularBuffer; + FinishedGeneration: Boolean = False; + FinishedPlaying: Boolean = False; + + Version: String; + EnvStr: String; + Status: Integer; + NumDevices: Integer; + DeviceIndex: Integer; + DeviceInfo: PPaDeviceInfo; + + { If you get an EDivByZero (division by zero) error, please change the sample rate + to one supported by your output device. + } + DeviceSampleRate: Integer = 48000; + I: Integer; + Param: TPaStreamParameters; + Stream: PPaStream; + Wave: TSherpaOnnxWave; + +function GenerateCallback( + Samples: pcfloat; N: cint32; + Arg: Pointer): cint; cdecl; +begin + EnterCriticalSection(CriticalSection); + try + if Resampler <> nil then + Buffer.Push(Resampler.Resample(Samples, N, False)) + else + Buffer.Push(Samples, N); + finally + LeaveCriticalSection(CriticalSection); + end; + + { 1 means to continue generating; 0 means to stop generating. 
} + Result := 1; +end; + +function PlayCallback( + input: Pointer; output: Pointer; + frameCount: culong; + timeInfo: PPaStreamCallbackTimeInfo; + statusFlags: TPaStreamCallbackFlags; + userData: Pointer ): cint; cdecl; +var + Samples: TSherpaOnnxSamplesArray; + I: Integer; +begin + EnterCriticalSection(CriticalSection); + try + if Buffer.Size >= frameCount then + begin + Samples := Buffer.Get(Buffer.Head, FrameCount); + Buffer.Pop(FrameCount); + end + else if Buffer.Size > 0 then + begin + Samples := Buffer.Get(Buffer.Head, Buffer.Size); + Buffer.Pop(Buffer.Size); + SetLength(Samples, frameCount); + end + else + SetLength(Samples, frameCount); + + for I := 0 to frameCount - 1 do + pcfloat(output)[I] := Samples[I]; + + if (Buffer.Size > 0) or (not FinishedGeneration) then + Result := paContinue + else + begin + Result := paComplete; + FinishedPlaying := True; + end; + finally + LeaveCriticalSection(CriticalSection); + end; +end; + +function GetOfflineTts: TSherpaOnnxOfflineTts; +var + Config: TSherpaOnnxOfflineTtsConfig; +begin + Config.Model.Vits.Model := './vits-piper-en_US-libritts_r-medium/en_US-libritts_r-medium.onnx'; + Config.Model.Vits.Tokens := './vits-piper-en_US-libritts_r-medium/tokens.txt'; + Config.Model.Vits.DataDir := './vits-piper-en_US-libritts_r-medium/espeak-ng-data'; + Config.Model.NumThreads := 1; + Config.Model.Debug := False; + Config.MaxNumSentences := 1; + + Result := TSherpaOnnxOfflineTts.Create(Config); +end; + +begin + Tts := GetOfflineTts; + if Tts.GetSampleRate <> DeviceSampleRate then + Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate); + + Version := String(Pa_GetVersionText); + WriteLn('Version is ', Version); + Status := Pa_Initialize; + if Status <> paNoError then + begin + WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status)); + Exit; + end; + + NumDevices := Pa_GetDeviceCount; + WriteLn('Num devices: ', NumDevices); + + DeviceIndex := Pa_GetDefaultOutputDevice; + + if DeviceIndex 
= paNoDevice then + begin + WriteLn('No default output device found'); + Pa_Terminate; + Exit; + end; + + EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE'); + if EnvStr <> '' then + begin + DeviceIndex := StrToIntDef(EnvStr, DeviceIndex); + WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr); + end; + + for I := 0 to (NumDevices - 1) do + begin + DeviceInfo := Pa_GetDeviceInfo(I); + if I = DeviceIndex then + { WriteLn(Format(' * %d %s', [I, DeviceInfo^.Name])) } + WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)])) + else + WriteLn(Format(' %d %s', [I, AnsiString(DeviceInfo^.Name)])); + end; + + WriteLn('Use device ', DeviceIndex); + WriteLn(' Name ', Pa_GetDeviceInfo(DeviceIndex)^.Name); + WriteLn(' Max output channels ', Pa_GetDeviceInfo(DeviceIndex)^.MaxOutputChannels); + + Initialize(Param); + Param.Device := DeviceIndex; + Param.ChannelCount := 1; + Param.SampleFormat := paFloat32; + param.SuggestedLatency := Pa_GetDeviceInfo(DeviceIndex)^.DefaultHighOutputLatency; + param.HostApiSpecificStreamInfo := nil; + + Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate); + + + { Note(fangjun): PortAudio invokes PlayCallback in a separate thread. } + Status := Pa_OpenStream(stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag, + PPaStreamCallback(@PlayCallback), nil); + + if Status <> paNoError then + begin + WriteLn('Failed to open stream, ', Pa_GetErrorText(Status)); + Pa_Terminate; + Exit; + end; + + InitCriticalSection(CriticalSection); + + Status := Pa_StartStream(stream); + if Status <> paNoError then + begin + WriteLn('Failed to start stream, ', Pa_GetErrorText(Status)); + Pa_Terminate; + Exit; + end; + + WriteLn('There are ', Tts.GetNumSpeakers, ' speakers'); + + Text := 'Today as always, men fall into two groups: slaves and free men. 
Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'; + + Audio := Tts.Generate(Text, SpeakerId, Speed, + PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil); + FinishedGeneration := True; + SherpaOnnxWriteWave('./libritts_r-generated.wav', Audio.Samples, Audio.SampleRate); + WriteLn('Saved to ./libritts_r-generated.wav'); + + while not FinishedPlaying do + Pa_Sleep(100); {sleep for 0.1 second } + {TODO(fangjun): Use an event to indicate the play is finished} + + Status := Pa_CloseStream(stream); + { Destroy the lock only after the stream is closed: PlayCallback sets FinishedPlaying before it leaves the critical section } + DoneCriticalSection(CriticalSection); + + FreeAndNil(Tts); + FreeAndNil(Resampler); + if Status <> paNoError then + begin + WriteLn('Failed to close stream, ', Pa_GetErrorText(Status)); + Exit; + end; + + Status := Pa_Terminate; + if Status <> paNoError then + begin + WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status)); + Exit; + end; +end. + diff --git a/pascal-api-examples/tts/piper.pas b/pascal-api-examples/tts/piper.pas new file mode 100644 index 000000000..7c3fb4e66 --- /dev/null +++ b/pascal-api-examples/tts/piper.pas @@ -0,0 +1,54 @@ +{ Copyright (c) 2024 Xiaomi Corporation } +program piper; +{ +This file shows how to use the text to speech API of sherpa-onnx +with Piper models. + +It generates speech from text and saves it to a wave file. 
+ +If you want to play it while it is generating, please see +./piper-playback.pas +} + +{$mode objfpc} + +uses + SysUtils, + sherpa_onnx; + +function GetOfflineTts: TSherpaOnnxOfflineTts; +var + Config: TSherpaOnnxOfflineTtsConfig; +begin + Config.Model.Vits.Model := './vits-piper-en_US-libritts_r-medium/en_US-libritts_r-medium.onnx'; + Config.Model.Vits.Tokens := './vits-piper-en_US-libritts_r-medium/tokens.txt'; + Config.Model.Vits.DataDir := './vits-piper-en_US-libritts_r-medium/espeak-ng-data'; + Config.Model.NumThreads := 1; + Config.Model.Debug := False; + Config.MaxNumSentences := 1; + + Result := TSherpaOnnxOfflineTts.Create(Config); +end; + +var + Tts: TSherpaOnnxOfflineTts; + Audio: TSherpaOnnxGeneratedAudio; + + Text: AnsiString; + Speed: Single = 1.0; {Use a larger value to speak faster} + SpeakerId: Integer = 0; + +begin + Tts := GetOfflineTts; + + WriteLn('There are ', Tts.GetNumSpeakers, ' speakers'); + + Text := 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.'; + + Audio := Tts.Generate(Text, SpeakerId, Speed); + SherpaOnnxWriteWave('./libritts_r-generated.wav', Audio.Samples, Audio.SampleRate); + WriteLn('Saved to ./libritts_r-generated.wav'); + + FreeAndNil(Tts); +end. + diff --git a/pascal-api-examples/tts/run-piper-playback.sh b/pascal-api-examples/tts/run-piper-playback.sh new file mode 100755 index 000000000..7076dee05 --- /dev/null +++ b/pascal-api-examples/tts/run-piper-playback.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! 
-f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + popd +fi + +if [[ ! -f ./vits-piper-en_US-libritts_r-medium/tokens.txt ]]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-libritts_r-medium.tar.bz2 + tar xf vits-piper-en_US-libritts_r-medium.tar.bz2 + rm vits-piper-en_US-libritts_r-medium.tar.bz2 +fi + +fpc \ + -dSHERPA_ONNX_USE_SHARED_LIBS \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + -Fl/usr/local/Cellar/portaudio/19.7.0/lib \ + ./piper-playback.pas + +# Please see ../portaudio-test/README.md +# for how to install portaudio on macOS + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./piper-playback diff --git a/pascal-api-examples/tts/run-piper.sh b/pascal-api-examples/tts/run-piper.sh new file mode 100755 index 000000000..7abed25ee --- /dev/null +++ b/pascal-api-examples/tts/run-piper.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! 
-f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + popd +fi + +if [[ ! -f ./vits-piper-en_US-libritts_r-medium/tokens.txt ]]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-libritts_r-medium.tar.bz2 + tar xf vits-piper-en_US-libritts_r-medium.tar.bz2 + rm vits-piper-en_US-libritts_r-medium.tar.bz2 +fi + +fpc \ + -dSHERPA_ONNX_USE_SHARED_LIBS \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + ./piper.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./piper diff --git a/scripts/apk/generate-tts-apk-script.py b/scripts/apk/generate-tts-apk-script.py index 48745c312..291c4738a 100755 --- a/scripts/apk/generate-tts-apk-script.py +++ b/scripts/apk/generate-tts-apk-script.py @@ -190,9 +190,9 @@ def get_piper_models() -> List[TtsModel]: TtsModel(model_dir="vits-piper-nl_BE-nathalie-x_low"), TtsModel(model_dir="vits-piper-nl_BE-rdh-medium"), TtsModel(model_dir="vits-piper-nl_BE-rdh-x_low"), - TtsModel(model_dir="vits-piper-nl_NL-mls-medium"), - TtsModel(model_dir="vits-piper-nl_NL-mls_5809-low"), - TtsModel(model_dir="vits-piper-nl_NL-mls_7432-low"), + # TtsModel(model_dir="vits-piper-nl_NL-mls-medium"), + # TtsModel(model_dir="vits-piper-nl_NL-mls_5809-low"), + # TtsModel(model_dir="vits-piper-nl_NL-mls_7432-low"), TtsModel(model_dir="vits-piper-no_NO-talesyntese-medium"), TtsModel(model_dir="vits-piper-pl_PL-darkman-medium"), TtsModel(model_dir="vits-piper-pl_PL-gosia-medium"), diff --git a/scripts/flutter/generate-tts.py 
b/scripts/flutter/generate-tts.py index 380d1f453..3ca5a98fd 100755 --- a/scripts/flutter/generate-tts.py +++ b/scripts/flutter/generate-tts.py @@ -180,9 +180,9 @@ def get_piper_models() -> List[TtsModel]: TtsModel(model_dir="vits-piper-nl_BE-nathalie-x_low"), TtsModel(model_dir="vits-piper-nl_BE-rdh-medium"), TtsModel(model_dir="vits-piper-nl_BE-rdh-x_low"), - TtsModel(model_dir="vits-piper-nl_NL-mls-medium"), - TtsModel(model_dir="vits-piper-nl_NL-mls_5809-low"), - TtsModel(model_dir="vits-piper-nl_NL-mls_7432-low"), + # TtsModel(model_dir="vits-piper-nl_NL-mls-medium"), + # TtsModel(model_dir="vits-piper-nl_NL-mls_5809-low"), + # TtsModel(model_dir="vits-piper-nl_NL-mls_7432-low"), TtsModel(model_dir="vits-piper-no_NO-talesyntese-medium"), TtsModel(model_dir="vits-piper-pl_PL-darkman-medium"), TtsModel(model_dir="vits-piper-pl_PL-gosia-medium"), diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index 46da81ca5..e01ae0478 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -18,6 +18,7 @@ #include "sherpa-onnx/csrc/offline-punctuation.h" #include "sherpa-onnx/csrc/offline-recognizer.h" #include "sherpa-onnx/csrc/online-recognizer.h" +#include "sherpa-onnx/csrc/resample.h" #include "sherpa-onnx/csrc/speaker-embedding-extractor.h" #include "sherpa-onnx/csrc/speaker-embedding-manager.h" #include "sherpa-onnx/csrc/spoken-language-identification.h" @@ -1584,3 +1585,56 @@ const char *SherpaOfflinePunctuationAddPunct( } void SherpaOfflinePunctuationFreeText(const char *text) { delete[] text; } + +struct SherpaOnnxLinearResampler { + std::unique_ptr<sherpa_onnx::LinearResample> impl; +}; + +SherpaOnnxLinearResampler *SherpaOnnxCreateLinearResampler( + int32_t samp_rate_in_hz, int32_t samp_rate_out_hz, float filter_cutoff_hz, + int32_t num_zeros) { + SherpaOnnxLinearResampler *p = new SherpaOnnxLinearResampler; + p->impl = std::make_unique<sherpa_onnx::LinearResample>( + samp_rate_in_hz, samp_rate_out_hz, filter_cutoff_hz, num_zeros); + + return p; +} + +void 
SherpaOnnxDestroyLinearResampler(SherpaOnnxLinearResampler *p) { + delete p; +} + +const SherpaOnnxResampleOut *SherpaOnnxLinearResamplerResample( + SherpaOnnxLinearResampler *p, const float *input, int32_t input_dim, + int32_t flush) { + std::vector<float> o; + p->impl->Resample(input, input_dim, flush, &o); + + float *s = new float[o.size()]; + std::copy(o.begin(), o.end(), s); + + SherpaOnnxResampleOut *ans = new SherpaOnnxResampleOut; + ans->samples = s; + ans->n = static_cast<int32_t>(o.size()); + + return ans; +} + +void SherpaOnnxLinearResamplerResampleFree(const SherpaOnnxResampleOut *p) { + delete[] p->samples; + delete p; +} + +int32_t SherpaOnnxLinearResamplerResampleGetInputSampleRate( + const SherpaOnnxLinearResampler *p) { + return p->impl->GetInputSamplingRate(); +} + +int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate( + const SherpaOnnxLinearResampler *p) { + return p->impl->GetOutputSamplingRate(); +} + +void SherpaOnnxLinearResamplerReset(SherpaOnnxLinearResampler *p) { + p->impl->Reset(); +} diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index 9ddabba00..97b8d8081 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -1315,6 +1315,52 @@ SHERPA_ONNX_API const char *SherpaOfflinePunctuationAddPunct( SHERPA_ONNX_API void SherpaOfflinePunctuationFreeText(const char *text); +// for resampling +SHERPA_ONNX_API typedef struct SherpaOnnxLinearResampler + SherpaOnnxLinearResampler; + +/* + float min_freq = min(samp_rate_in_hz, samp_rate_out_hz); + float lowpass_cutoff = 0.99 * 0.5 * min_freq; + int32_t lowpass_filter_width = 6; + + You can set filter_cutoff_hz to lowpass_cutoff + and set num_zeros to lowpass_filter_width +*/ +// The user has to invoke SherpaOnnxDestroyLinearResampler() +// to free the returned pointer to avoid memory leak +SHERPA_ONNX_API SherpaOnnxLinearResampler *SherpaOnnxCreateLinearResampler( + int32_t samp_rate_in_hz, int32_t samp_rate_out_hz, float filter_cutoff_hz, + int32_t num_zeros); + 
+SHERPA_ONNX_API void SherpaOnnxDestroyLinearResampler( + SherpaOnnxLinearResampler *p); + +SHERPA_ONNX_API void SherpaOnnxLinearResamplerReset( + SherpaOnnxLinearResampler *p); + +typedef struct SherpaOnnxResampleOut { + const float *samples; + int32_t n; +} SherpaOnnxResampleOut; +// The user has to invoke SherpaOnnxLinearResamplerResampleFree() +// to free the returned pointer to avoid memory leak. +// +// If this is the last segment, you can set flush to 1; otherwise, please +// set flush to 0 +SHERPA_ONNX_API const SherpaOnnxResampleOut *SherpaOnnxLinearResamplerResample( + SherpaOnnxLinearResampler *p, const float *input, int32_t input_dim, + int32_t flush); + +SHERPA_ONNX_API void SherpaOnnxLinearResamplerResampleFree( + const SherpaOnnxResampleOut *p); + +SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetInputSampleRate( + const SherpaOnnxLinearResampler *p); + +SHERPA_ONNX_API int32_t SherpaOnnxLinearResamplerResampleGetOutputSampleRate( + const SherpaOnnxLinearResampler *p); + #if defined(__GNUC__) #pragma GCC diagnostic pop #endif diff --git a/sherpa-onnx/pascal-api/sherpa_onnx.pas b/sherpa-onnx/pascal-api/sherpa_onnx.pas index 603f3d14d..dc0684ebc 100644 --- a/sherpa-onnx/pascal-api/sherpa_onnx.pas +++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas @@ -1,4 +1,9 @@ -{ Copyright (c) 2024 Xiaomi Corporation } +{ Copyright (c) 2024 Xiaomi Corporation + +Please see +https://github.com/k2-fsa/sherpa-onnx/tree/master/pascal-api-examples +for how to use APIs in this file. 
+} unit sherpa_onnx; @@ -7,13 +12,105 @@ {$modeSwitch advancedRecords} { to support records with methods } {$ENDIF} -(* {$LongStrings ON} *) +{$LongStrings ON} interface uses ctypes; type + TSherpaOnnxSamplesArray = array of Single; + + TSherpaOnnxLinearResampler = class + private + Handle: Pointer; + InputSampleRate: Integer; + OutputSampleRate: Integer; + public + constructor Create(SampleRateIn: Integer; SampleRateOut: Integer); + destructor Destroy; override; + + function Resample(Samples: pcfloat; + N: Integer; Flush: Boolean): TSherpaOnnxSamplesArray; overload; + + function Resample(Samples: array of Single; + Flush: Boolean): TSherpaOnnxSamplesArray; overload; + + procedure Reset; + + property GetInputSampleRate: Integer Read InputSampleRate; + property GetOutputSampleRate: Integer Read OutputSampleRate; + end; + + PSherpaOnnxGeneratedAudioCallbackWithArg = ^TSherpaOnnxGeneratedAudioCallbackWithArg; + + TSherpaOnnxGeneratedAudioCallbackWithArg = function( + Samples: pcfloat; N: cint32; + Arg: Pointer): cint; cdecl; + + TSherpaOnnxOfflineTtsVitsModelConfig = record + Model: AnsiString; + Lexicon: AnsiString; + Tokens: AnsiString; + DataDir: AnsiString; + NoiseScale: Single; + NoiseScaleW: Single; + LengthScale: Single; + DictDir: AnsiString; + + function ToString: AnsiString; + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsVitsModelConfig); + end; + + TSherpaOnnxOfflineTtsModelConfig = record + Vits: TSherpaOnnxOfflineTtsVitsModelConfig; + NumThreads: Integer; + Debug: Boolean; + Provider: AnsiString; + + function ToString: AnsiString; + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsModelConfig); + end; + + TSherpaOnnxOfflineTtsConfig = record + Model: TSherpaOnnxOfflineTtsModelConfig; + RuleFsts: AnsiString; + MaxNumSentences: Integer; + RuleFars: AnsiString; + + function ToString: AnsiString; + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: 
TSherpaOnnxOfflineTtsConfig); + end; + + TSherpaOnnxGeneratedAudio = record + Samples: array of Single; + SampleRate: Integer; + end; + + TSherpaOnnxOfflineTts = class + private + Handle: Pointer; + SampleRate: Integer; + NumSpeakers: Integer; + _Config: TSherpaOnnxOfflineTtsConfig; + public + constructor Create(Config: TSherpaOnnxOfflineTtsConfig); + destructor Destroy; override; + + function Generate(Text: AnsiString; SpeakerId: Integer; + Speed: Single): TSherpaOnnxGeneratedAudio; overload; + + function Generate(Text: AnsiString; SpeakerId: Integer; + Speed: Single; + Callback:PSherpaOnnxGeneratedAudioCallbackWithArg; + Arg: Pointer + ): TSherpaOnnxGeneratedAudio; overload; + + property GetHandle: Pointer Read Handle; + property GetSampleRate: Integer Read SampleRate; + property GetNumSpeakers: Integer Read NumSpeakers; + end; + TSherpaOnnxWave = record Samples: array of Single; { normalized to the range [-1, 1] } SampleRate: Integer; @@ -254,7 +351,6 @@ TSherpaOnnxVadModelConfig = record class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxVadModelConfig); end; - TSherpaOnnxSamplesArray = array of Single; TSherpaOnnxCircularBuffer = class private @@ -508,6 +604,94 @@ SherpaOnnxSpeechSegment = record PSherpaOnnxSpeechSegment = ^SherpaOnnxSpeechSegment; + SherpaOnnxOfflineTtsVitsModelConfig = record + Model: PAnsiChar; + Lexicon: PAnsiChar; + Tokens: PAnsiChar; + DataDir: PAnsiChar; + NoiseScale: cfloat; + NoiseScaleW: cfloat; + LengthScale: cfloat; + DictDir: PAnsiChar; + end; + + SherpaOnnxOfflineTtsModelConfig = record + Vits: SherpaOnnxOfflineTtsVitsModelConfig; + NumThreads: cint32; + Debug: cint32; + Provider: PAnsiChar; + end; + + SherpaOnnxOfflineTtsConfig = record + Model: SherpaOnnxOfflineTtsModelConfig; + RuleFsts: PAnsiChar; + MaxNumSentences: cint32; + RuleFars: PAnsiChar; + end; + + PSherpaOnnxOfflineTtsConfig = ^SherpaOnnxOfflineTtsConfig; + + SherpaOnnxGeneratedAudio = record + Samples: pcfloat; + N: cint32; + SampleRate: 
cint32; + end; + + PSherpaOnnxGeneratedAudio = ^SherpaOnnxGeneratedAudio; + + SherpaOnnxResampleOut = record + Samples: pcfloat; + N: cint32; + end; + + PSherpaOnnxResampleOut = ^SherpaOnnxResampleOut; + +function SherpaOnnxCreateLinearResampler(SampleRateInHz: cint32; + SampleRateOutHz: cint32; + FilterCutoffHz: cfloat; + NumZeros: cint32): Pointer; cdecl; + external SherpaOnnxLibName; + +procedure SherpaOnnxDestroyLinearResampler(P: Pointer); cdecl; + external SherpaOnnxLibName; + +function SherpaOnnxLinearResamplerResample(P: Pointer; + Samples: pcfloat; + N: cint32; + Flush: cint32): PSherpaOnnxResampleOut; cdecl; + external SherpaOnnxLibName; + +procedure SherpaOnnxLinearResamplerResampleFree(P: PSherpaOnnxResampleOut); cdecl; + external SherpaOnnxLibName; + +procedure SherpaOnnxLinearResamplerReset(P: Pointer); cdecl; + external SherpaOnnxLibName; + +function SherpaOnnxCreateOfflineTts(Config: PSherpaOnnxOfflineTtsConfig): Pointer; cdecl; + external SherpaOnnxLibName; + +procedure SherpaOnnxDestroyOfflineTts(Tts: Pointer); cdecl; + external SherpaOnnxLibName; + +function SherpaOnnxOfflineTtsSampleRate(Tts: Pointer): cint32; cdecl; + external SherpaOnnxLibName; + +function SherpaOnnxOfflineTtsNumSpeakers(Tts: Pointer): cint32; cdecl; + external SherpaOnnxLibName; + +function SherpaOnnxOfflineTtsGenerate(Tts: Pointer; + Text: PAnsiChar; Sid: cint32; Speed: cfloat): PSherpaOnnxGeneratedAudio; cdecl; + external SherpaOnnxLibName; + +function SherpaOnnxOfflineTtsGenerateWithCallbackWithArg(Tts: Pointer; + Text: PAnsiChar; Sid: cint32; Speed: cfloat; + Callback: PSherpaOnnxGeneratedAudioCallbackWithArg; + Arg: Pointer): PSherpaOnnxGeneratedAudio; cdecl; + external SherpaOnnxLibName; + +procedure SherpaOnnxDestroyOfflineTtsGeneratedAudio(Audio: Pointer); cdecl; + external SherpaOnnxLibName; + function SherpaOnnxCreateVoiceActivityDetector(Config: PSherpaOnnxVadModelConfig; BufferSizeInSeconds: cfloat): Pointer; cdecl; external SherpaOnnxLibName; @@ -793,8 +977,7 
@@ constructor TSherpaOnnxOnlineRecognizer.Create(Config: TSherpaOnnxOnlineRecogniz var C: SherpaOnnxOnlineRecognizerConfig; begin - Initialize(C); - + C := Default(SherpaOnnxOnlineRecognizerConfig); C.FeatConfig.SampleRate := Config.FeatConfig.SampleRate; C.FeatConfig.FeatureDim := Config.FeatConfig.FeatureDim; @@ -1051,8 +1234,7 @@ constructor TSherpaOnnxOfflineRecognizer.Create(Config: TSherpaOnnxOfflineRecogn var C: SherpaOnnxOfflineRecognizerConfig; begin - Initialize(C); - + C := Default(SherpaOnnxOfflineRecognizerConfig); C.FeatConfig.SampleRate := Config.FeatConfig.SampleRate; C.FeatConfig.FeatureDim := Config.FeatConfig.FeatureDim; @@ -1369,12 +1551,11 @@ function TSherpaOnnxCircularBuffer.Head: Integer; constructor TSherpaOnnxVoiceActivityDetector.Create(Config: TSherpaOnnxVadModelConfig; BufferSizeInSeconds: Single); var - C: SherpaOnnxVadModelConfig; + C: SherpaOnnxVadModelConfig ; begin + C := Default(SherpaOnnxVadModelConfig); Self._Config := Config; - Initialize(C); - C.SileroVad.Model := PAnsiChar(Config.SileroVad.Model); C.SileroVad.Threshold := Config.SileroVad.Threshold; C.SileroVad.MinSilenceDuration := Config.SileroVad.MinSilenceDuration; @@ -1460,5 +1641,197 @@ procedure TSherpaOnnxVoiceActivityDetector.Flush; SherpaOnnxVoiceActivityDetectorFlush(Self.Handle); end; -end. 
+function TSherpaOnnxOfflineTtsVitsModelConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineTtsVitsModelConfig(' + + 'Model := %s, ' + + 'Lexicon := %s, ' + + 'Tokens := %s, ' + + 'DataDir := %s, ' + + 'NoiseScale := %.2f, ' + + 'NoiseScaleW := %.2f, ' + + 'LengthScale := %.2f, ' + + 'DictDir := %s' + + ')', + [Self.Model, Self.Lexicon, Self.Tokens, Self.DataDir, Self.NoiseScale, + Self.NoiseScaleW, Self.LengthScale, Self.DictDir + ]); +end; + +class operator TSherpaOnnxOfflineTtsVitsModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsVitsModelConfig); +begin + Dest.NoiseScale := 0.667; + Dest.NoiseScaleW := 0.8; + Dest.LengthScale := 1.0; +end; + +function TSherpaOnnxOfflineTtsModelConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineTtsModelConfig(' + + 'Vits := %s, ' + + 'NumThreads := %d, ' + + 'Debug := %s, ' + + 'Provider := %s' + + ')', + [Self.Vits.ToString, Self.NumThreads, Self.Debug.ToString, Self.Provider + ]); +end; + +class operator TSherpaOnnxOfflineTtsModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsModelConfig); +begin + Dest.NumThreads := 1; + Dest.Debug := False; + Dest.Provider := 'cpu'; +end; + +function TSherpaOnnxOfflineTtsConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineTtsConfig(' + + 'Model := %s, ' + + 'RuleFsts := %s, ' + + 'MaxNumSentences := %d, ' + + 'RuleFars := %s' + + ')', + [Self.Model.ToString, Self.RuleFsts, Self.MaxNumSentences, Self.RuleFars + ]); +end; + +class operator TSherpaOnnxOfflineTtsConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsConfig); +begin + Dest.MaxNumSentences := 1; +end; + +constructor TSherpaOnnxOfflineTts.Create(Config: TSherpaOnnxOfflineTtsConfig); +var + C: SherpaOnnxOfflineTtsConfig; +begin + C := Default(SherpaOnnxOfflineTtsConfig); + Self._Config := Config; + + C.Model.Vits.Model := PAnsiChar(Config.Model.Vits.Model); + 
C.Model.Vits.Lexicon := PAnsiChar(Config.Model.Vits.Lexicon); + C.Model.Vits.Tokens := PAnsiChar(Config.Model.Vits.Tokens); + C.Model.Vits.DataDir := PAnsiChar(Config.Model.Vits.DataDir); + C.Model.Vits.NoiseScale := Config.Model.Vits.NoiseScale; + C.Model.Vits.NoiseScaleW := Config.Model.Vits.NoiseScaleW; + C.Model.Vits.LengthScale := Config.Model.Vits.LengthScale; + C.Model.Vits.DictDir := PAnsiChar(Config.Model.Vits.DictDir); + + C.Model.NumThreads := Config.Model.NumThreads; + C.Model.Provider := PAnsiChar(Config.Model.Provider); + C.Model.Debug := Ord(Config.Model.Debug); + + C.RuleFsts := PAnsiChar(Config.RuleFsts); + C.MaxNumSentences := Config.MaxNumSentences; + C.RuleFars := PAnsiChar(Config.RuleFars); + + Self.Handle := SherpaOnnxCreateOfflineTts(@C); + + Self.SampleRate := SherpaOnnxOfflineTtsSampleRate(Self.Handle); + Self.NumSpeakers := SherpaOnnxOfflineTtsNumSpeakers(Self.Handle); +end; + +destructor TSherpaOnnxOfflineTts.Destroy; +begin + SherpaOnnxDestroyOfflineTts(Self.Handle); + Self.Handle := nil; +end; + +function TSherpaOnnxOfflineTts.Generate(Text: AnsiString; SpeakerId: Integer; + Speed: Single): TSherpaOnnxGeneratedAudio; +var + Audio: PSherpaOnnxGeneratedAudio; + I: Integer; +begin + Result := Default(TSherpaOnnxGeneratedAudio); + + Audio := SherpaOnnxOfflineTtsGenerate(Self.Handle, PAnsiChar(Text), SpeakerId, Speed); + + SetLength(Result.Samples, Audio^.N); + Result.SampleRate := Audio^.SampleRate; + + for I := Low(Result.Samples) to High(Result.Samples) do + begin + Result.Samples[I] := Audio^.Samples[I]; + end; + + SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio); +end; + +function TSherpaOnnxOfflineTts.Generate(Text: AnsiString; SpeakerId: Integer; + Speed: Single; + Callback:PSherpaOnnxGeneratedAudioCallbackWithArg; + Arg: Pointer + ): TSherpaOnnxGeneratedAudio; +var + Audio: PSherpaOnnxGeneratedAudio; + I: Integer; +begin + Result := Default(TSherpaOnnxGeneratedAudio); + + Audio := 
SherpaOnnxOfflineTtsGenerateWithCallbackWithArg(Self.Handle, PAnsiChar(Text), + SpeakerId, Speed, Callback, Arg); + + SetLength(Result.Samples, Audio^.N); + Result.SampleRate := Audio^.SampleRate; + + for I := Low(Result.Samples) to High(Result.Samples) do + begin + Result.Samples[I] := Audio^.Samples[I]; + end; + + SherpaOnnxDestroyOfflineTtsGeneratedAudio(audio); +end; +constructor TSherpaOnnxLinearResampler.Create(SampleRateIn: Integer; SampleRateOut: Integer); +var + MinFreq: Single; + LowpassCutoff: Single; + LowpassFilterWidth: Integer = 6; +begin + if SampleRateIn > SampleRateOut then + MinFreq := SampleRateOut + else + MinFreq := SampleRateIn; + + LowpassCutoff := 0.99 * 0.5 * MinFreq; + + Self.Handle := SherpaOnnxCreateLinearResampler(SampleRateIn, + SampleRateOut, LowpassCutoff, LowpassFilterWidth); + Self.InputSampleRate := SampleRateIn; + Self.OutputSampleRate := SampleRateOut; +end; + +destructor TSherpaOnnxLinearResampler.Destroy; +begin + SherpaOnnxDestroyLinearResampler(Self.Handle); + Self.Handle := nil; +end; + +function TSherpaOnnxLinearResampler.Resample(Samples: pcfloat; + N: Integer; Flush: Boolean): TSherpaOnnxSamplesArray; +var + P: PSherpaOnnxResampleOut; + I: Integer; +begin + Result := Default(TSherpaOnnxSamplesArray); + P := SherpaOnnxLinearResamplerResample(Self.Handle, Samples, N, Ord(Flush)); + SetLength(Result, P^.N); + + for I := Low(Result) to High(Result) do + Result[I] := P^.Samples[I]; + + SherpaOnnxLinearResamplerResampleFree(P); +end; + +function TSherpaOnnxLinearResampler.Resample(Samples: array of Single; Flush: Boolean): TSherpaOnnxSamplesArray; +begin + Result := Self.Resample(pcfloat(Samples), Length(Samples), Flush); +end; + +procedure TSherpaOnnxLinearResampler.Reset; +begin + SherpaOnnxLinearResamplerReset(Self.Handle); +end; + +end.