From c6fcd32552d754a21721045b0937696fb9c38da1 Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Mon, 6 Jan 2025 10:04:35 +0800 Subject: [PATCH] Add Pascal API for MatchaTTS models. (#1686) --- .github/workflows/pascal.yaml | 13 + pascal-api-examples/tts/.gitignore | 4 + .../tts/matcha-en-playback.pas | 239 +++++++++++++++++ pascal-api-examples/tts/matcha-en.pas | 55 ++++ .../tts/matcha-zh-playback.pas | 241 ++++++++++++++++++ pascal-api-examples/tts/matcha-zh.pas | 57 +++++ pascal-api-examples/tts/piper-playback.pas | 2 +- .../tts/run-matcha-en-playback.sh | 53 ++++ pascal-api-examples/tts/run-matcha-en.sh | 49 ++++ .../tts/run-matcha-zh-playback.sh | 52 ++++ pascal-api-examples/tts/run-matcha-zh.sh | 48 ++++ sherpa-onnx/pascal-api/sherpa_onnx.pas | 65 ++++- 12 files changed, 875 insertions(+), 3 deletions(-) create mode 100644 pascal-api-examples/tts/matcha-en-playback.pas create mode 100644 pascal-api-examples/tts/matcha-en.pas create mode 100644 pascal-api-examples/tts/matcha-zh-playback.pas create mode 100644 pascal-api-examples/tts/matcha-zh.pas create mode 100755 pascal-api-examples/tts/run-matcha-en-playback.sh create mode 100755 pascal-api-examples/tts/run-matcha-en.sh create mode 100755 pascal-api-examples/tts/run-matcha-zh-playback.sh create mode 100755 pascal-api-examples/tts/run-matcha-zh.sh diff --git a/.github/workflows/pascal.yaml b/.github/workflows/pascal.yaml index 306ae6480..5193241b6 100644 --- a/.github/workflows/pascal.yaml +++ b/.github/workflows/pascal.yaml @@ -152,6 +152,19 @@ jobs: ./run-piper.sh rm -rf vits-piper-* + rm piper + ls -lh + echo "---" + + ./run-matcha-zh.sh + rm -rf matcha-icefall-* + rm matcha-zh + ls -lh + echo "---" + + ./run-matcha-en.sh + rm -rf matcha-icefall-* + rm matcha-en ls -lh echo "---" diff --git a/pascal-api-examples/tts/.gitignore b/pascal-api-examples/tts/.gitignore index b7076ab5c..c7d282825 100644 --- a/pascal-api-examples/tts/.gitignore +++ b/pascal-api-examples/tts/.gitignore @@ -2,3 +2,7 @@ piper 
piper-playback link*.res +matcha-zh +matcha-en +matcha-zh-playback +matcha-en-playback diff --git a/pascal-api-examples/tts/matcha-en-playback.pas b/pascal-api-examples/tts/matcha-en-playback.pas new file mode 100644 index 000000000..e750099cb --- /dev/null +++ b/pascal-api-examples/tts/matcha-en-playback.pas @@ -0,0 +1,239 @@ +{ Copyright (c) 2025 Xiaomi Corporation } +program matcha_en_playback; +{ +This file shows how to use the text to speech API of sherpa-onnx +with MatchaTTS models. + +It generates speech from text and saves it to a wave file. + +Note that it plays the audio back as it is still generating. +} + +{$mode objfpc} + +uses + {$ifdef unix} + cthreads, + {$endif} + SysUtils, + dos, + ctypes, + portaudio, + sherpa_onnx; + +var + CriticalSection: TRTLCriticalSection; + + Tts: TSherpaOnnxOfflineTts; + Audio: TSherpaOnnxGeneratedAudio; + Resampler: TSherpaOnnxLinearResampler; + + Text: AnsiString; + Speed: Single = 1.0; {Use a larger value to speak faster} + SpeakerId: Integer = 0; + Buffer: TSherpaOnnxCircularBuffer; + FinishedGeneration: Boolean = False; + FinishedPlaying: Boolean = False; + + Version: String; + EnvStr: String; + Status: Integer; + NumDevices: Integer; + DeviceIndex: Integer; + DeviceInfo: PPaDeviceInfo; + + { If you get EDivByZero: Division by zero error, please change the sample rate + to the one supported by your output device. + } + DeviceSampleRate: Integer = 48000; + I: Integer; + Param: TPaStreamParameters; + Stream: PPaStream; + Wave: TSherpaOnnxWave; + +function GenerateCallback( + Samples: pcfloat; N: cint32; + Arg: Pointer): cint; cdecl; +begin + EnterCriticalSection(CriticalSection); + try + if Resampler <> nil then + Buffer.Push(Resampler.Resample(Samples, N, False)) + else + Buffer.Push(Samples, N); + finally + LeaveCriticalSection(CriticalSection); + end; + + { 1 means to continue generating; 0 means to stop generating. 
} + Result := 1; +end; + +function PlayCallback( + input: Pointer; output: Pointer; + frameCount: culong; + timeInfo: PPaStreamCallbackTimeInfo; + statusFlags: TPaStreamCallbackFlags; + userData: Pointer ): cint; cdecl; +var + Samples: TSherpaOnnxSamplesArray; + I: Integer; +begin + EnterCriticalSection(CriticalSection); + try + if Buffer.Size >= frameCount then + begin + Samples := Buffer.Get(Buffer.Head, FrameCount); + Buffer.Pop(FrameCount); + end + else if Buffer.Size > 0 then + begin + Samples := Buffer.Get(Buffer.Head, Buffer.Size); + Buffer.Pop(Buffer.Size); + SetLength(Samples, frameCount); + end + else + SetLength(Samples, frameCount); + + for I := 0 to frameCount - 1 do + pcfloat(output)[I] := Samples[I]; + + if (Buffer.Size > 0) or (not FinishedGeneration) then + Result := paContinue + else + begin + Result := paComplete; + FinishedPlaying := True; + end; + finally + LeaveCriticalSection(CriticalSection); + end; +end; + +function GetOfflineTts: TSherpaOnnxOfflineTts; +var + Config: TSherpaOnnxOfflineTtsConfig; +begin + Config.Model.Matcha.AcousticModel := './matcha-icefall-en_US-ljspeech/model-steps-3.onnx'; + Config.Model.Matcha.Vocoder := './hifigan_v2.onnx'; + Config.Model.Matcha.Tokens := './matcha-icefall-en_US-ljspeech/tokens.txt'; + Config.Model.Matcha.DataDir := './matcha-icefall-en_US-ljspeech/espeak-ng-data'; + Config.Model.NumThreads := 1; + Config.Model.Debug := False; + Config.MaxNumSentences := 1; + + Result := TSherpaOnnxOfflineTts.Create(Config); +end; + +begin + Tts := GetOfflineTts; + if Tts.GetSampleRate <> DeviceSampleRate then + Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate); + + Version := String(Pa_GetVersionText); + WriteLn('Version is ', Version); + Status := Pa_Initialize; + if Status <> paNoError then + begin + WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status)); + Exit; + end; + + NumDevices := Pa_GetDeviceCount; + WriteLn('Num devices: ', NumDevices); + + DeviceIndex := 
Pa_GetDefaultOutputDevice; + + if DeviceIndex = paNoDevice then + begin + WriteLn('No default output device found'); + Pa_Terminate; + Exit; + end; + + EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE'); + if EnvStr <> '' then + begin + DeviceIndex := StrToIntDef(EnvStr, DeviceIndex); + WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr); + end; + + for I := 0 to (NumDevices - 1) do + begin + DeviceInfo := Pa_GetDeviceInfo(I); + if I = DeviceIndex then + { WriteLn(Format(' * %d %s', [I, DeviceInfo^.Name])) } + WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)])) + else + WriteLn(Format(' %d %s', [I, AnsiString(DeviceInfo^.Name)])); + end; + + WriteLn('Use device ', DeviceIndex); + WriteLn(' Name ', Pa_GetDeviceInfo(DeviceIndex)^.Name); + WriteLn(' Max output channels ', Pa_GetDeviceInfo(DeviceIndex)^.MaxOutputChannels); + + Initialize(Param); + Param.Device := DeviceIndex; + Param.ChannelCount := 1; + Param.SampleFormat := paFloat32; + param.SuggestedLatency := Pa_GetDeviceInfo(DeviceIndex)^.DefaultHighOutputLatency; + param.HostApiSpecificStreamInfo := nil; + + Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate); + + + { Note(fangjun): PortAudio invokes PlayCallback in a separate thread. } + Status := Pa_OpenStream(stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag, + PPaStreamCallback(@PlayCallback), nil); + + if Status <> paNoError then + begin + WriteLn('Failed to open stream, ', Pa_GetErrorText(Status)); + Pa_Terminate; + Exit; + end; + + InitCriticalSection(CriticalSection); + + Status := Pa_StartStream(stream); + if Status <> paNoError then + begin + WriteLn('Failed to start stream, ', Pa_GetErrorText(Status)); + Pa_Terminate; + Exit; + end; + + WriteLn('There are ', Tts.GetNumSpeakers, ' speakers'); + + Text := 'Friends fell out often because life was changing so fast. 
The easiest thing in the world was to lose touch with someone.'; + + Audio := Tts.Generate(Text, SpeakerId, Speed, + PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil); + FinishedGeneration := True; {PlayCallback returns paComplete once the buffer is empty} + SherpaOnnxWriteWave('./matcha-en-playback.wav', Audio.Samples, Audio.SampleRate); + WriteLn('Saved to ./matcha-en-playback.wav'); + + while not FinishedPlaying do + Pa_Sleep(100); {sleep for 0.1 second } + {TODO(fangjun): Use an event to indicate the play is finished} + + DoneCriticalSection(CriticalSection); + + FreeAndNil(Tts); + FreeAndNil(Resampler); + + Status := Pa_CloseStream(stream); + if Status <> paNoError then + begin + WriteLn('Failed to close stream, ', Pa_GetErrorText(Status)); + Exit; + end; + + Status := Pa_Terminate; + if Status <> paNoError then + begin + WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status)); + Exit; + end; +end. + diff --git a/pascal-api-examples/tts/matcha-en.pas b/pascal-api-examples/tts/matcha-en.pas new file mode 100644 index 000000000..7ef34b703 --- /dev/null +++ b/pascal-api-examples/tts/matcha-en.pas @@ -0,0 +1,55 @@ +{ Copyright (c) 2025 Xiaomi Corporation } +program matcha_en; +{ +This file shows how to use the text to speech API of sherpa-onnx +with MatchaTTS models. + +It generates speech from text and saves it to a wave file. 
+ +If you want to play it while it is generating, please see +./matcha-en-playback.pas +} + +{$mode objfpc} + +uses + SysUtils, + sherpa_onnx; + +function GetOfflineTts: TSherpaOnnxOfflineTts; +var + Config: TSherpaOnnxOfflineTtsConfig; +begin + Config.Model.Matcha.AcousticModel := './matcha-icefall-en_US-ljspeech/model-steps-3.onnx'; + Config.Model.Matcha.Vocoder := './hifigan_v2.onnx'; + Config.Model.Matcha.Tokens := './matcha-icefall-en_US-ljspeech/tokens.txt'; + Config.Model.Matcha.DataDir := './matcha-icefall-en_US-ljspeech/espeak-ng-data'; + Config.Model.NumThreads := 1; + Config.Model.Debug := False; + Config.MaxNumSentences := 1; + + Result := TSherpaOnnxOfflineTts.Create(Config); +end; + +var + Tts: TSherpaOnnxOfflineTts; + Audio: TSherpaOnnxGeneratedAudio; + + Text: AnsiString; + Speed: Single = 1.0; {Use a larger value to speak faster} + SpeakerId: Integer = 0; + +begin + Tts := GetOfflineTts; + + WriteLn('There are ', Tts.GetNumSpeakers, ' speakers'); + + Text := 'Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone.'; + + Audio := Tts.Generate(Text, SpeakerId, Speed); + SherpaOnnxWriteWave('./matcha-en.wav', Audio.Samples, Audio.SampleRate); + WriteLn('Saved to ./matcha-en.wav'); + + FreeAndNil(Tts); +end. + diff --git a/pascal-api-examples/tts/matcha-zh-playback.pas b/pascal-api-examples/tts/matcha-zh-playback.pas new file mode 100644 index 000000000..08b2bbe2d --- /dev/null +++ b/pascal-api-examples/tts/matcha-zh-playback.pas @@ -0,0 +1,241 @@ +{ Copyright (c) 2025 Xiaomi Corporation } +program matcha_zh_playback; +{ +This file shows how to use the text to speech API of sherpa-onnx +with MatchaTTS models. + +It generates speech from text and saves it to a wave file. + +Note that it plays the audio back as it is still generating. 
+} + +{$mode objfpc} + +uses + {$ifdef unix} + cthreads, + {$endif} + SysUtils, + dos, + ctypes, + portaudio, + sherpa_onnx; + +var + CriticalSection: TRTLCriticalSection; + + Tts: TSherpaOnnxOfflineTts; + Audio: TSherpaOnnxGeneratedAudio; + Resampler: TSherpaOnnxLinearResampler; + + Text: AnsiString; + Speed: Single = 1.0; {Use a larger value to speak faster} + SpeakerId: Integer = 0; + Buffer: TSherpaOnnxCircularBuffer; + FinishedGeneration: Boolean = False; + FinishedPlaying: Boolean = False; + + Version: String; + EnvStr: String; + Status: Integer; + NumDevices: Integer; + DeviceIndex: Integer; + DeviceInfo: PPaDeviceInfo; + + { If you get EDivByZero: Division by zero error, please change the sample rate + to the one supported by your output device. + } + DeviceSampleRate: Integer = 48000; + I: Integer; + Param: TPaStreamParameters; + Stream: PPaStream; + Wave: TSherpaOnnxWave; + +function GenerateCallback( + Samples: pcfloat; N: cint32; + Arg: Pointer): cint; cdecl; +begin + EnterCriticalSection(CriticalSection); + try + if Resampler <> nil then + Buffer.Push(Resampler.Resample(Samples, N, False)) + else + Buffer.Push(Samples, N); + finally + LeaveCriticalSection(CriticalSection); + end; + + { 1 means to continue generating; 0 means to stop generating. 
} + Result := 1; +end; + +function PlayCallback( + input: Pointer; output: Pointer; + frameCount: culong; + timeInfo: PPaStreamCallbackTimeInfo; + statusFlags: TPaStreamCallbackFlags; + userData: Pointer ): cint; cdecl; +var + Samples: TSherpaOnnxSamplesArray; + I: Integer; +begin + EnterCriticalSection(CriticalSection); + try + if Buffer.Size >= frameCount then + begin + Samples := Buffer.Get(Buffer.Head, FrameCount); + Buffer.Pop(FrameCount); + end + else if Buffer.Size > 0 then + begin + Samples := Buffer.Get(Buffer.Head, Buffer.Size); + Buffer.Pop(Buffer.Size); + SetLength(Samples, frameCount); + end + else + SetLength(Samples, frameCount); + + for I := 0 to frameCount - 1 do + pcfloat(output)[I] := Samples[I]; + + if (Buffer.Size > 0) or (not FinishedGeneration) then + Result := paContinue + else + begin + Result := paComplete; + FinishedPlaying := True; + end; + finally + LeaveCriticalSection(CriticalSection); + end; +end; + +function GetOfflineTts: TSherpaOnnxOfflineTts; +var + Config: TSherpaOnnxOfflineTtsConfig; +begin + Config.Model.Matcha.AcousticModel := './matcha-icefall-zh-baker/model-steps-3.onnx'; + Config.Model.Matcha.Vocoder := './hifigan_v2.onnx'; + Config.Model.Matcha.Lexicon := './matcha-icefall-zh-baker/lexicon.txt'; + Config.Model.Matcha.Tokens := './matcha-icefall-zh-baker/tokens.txt'; + Config.Model.Matcha.DictDir := './matcha-icefall-zh-baker/dict'; + Config.Model.NumThreads := 1; + Config.Model.Debug := False; + Config.RuleFsts := './matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst'; + Config.MaxNumSentences := 1; + + Result := TSherpaOnnxOfflineTts.Create(Config); +end; + +begin + Tts := GetOfflineTts; + if Tts.GetSampleRate <> DeviceSampleRate then + Resampler := TSherpaOnnxLinearResampler.Create(Tts.GetSampleRate, DeviceSampleRate); + + Version := String(Pa_GetVersionText); + WriteLn('Version is ', Version); + Status := Pa_Initialize; + if Status <> paNoError then + begin + 
WriteLn('Failed to initialize portaudio, ', Pa_GetErrorText(Status)); + Exit; + end; + + NumDevices := Pa_GetDeviceCount; + WriteLn('Num devices: ', NumDevices); + + DeviceIndex := Pa_GetDefaultOutputDevice; + + if DeviceIndex = paNoDevice then + begin + WriteLn('No default output device found'); + Pa_Terminate; + Exit; + end; + + EnvStr := GetEnv('SHERPA_ONNX_MIC_DEVICE'); + if EnvStr <> '' then + begin + DeviceIndex := StrToIntDef(EnvStr, DeviceIndex); + WriteLn('Use device index from environment variable SHERPA_ONNX_MIC_DEVICE: ', EnvStr); + end; + + for I := 0 to (NumDevices - 1) do + begin + DeviceInfo := Pa_GetDeviceInfo(I); + if I = DeviceIndex then + { WriteLn(Format(' * %d %s', [I, DeviceInfo^.Name])) } + WriteLn(Format(' * %d %s', [I, AnsiString(DeviceInfo^.Name)])) + else + WriteLn(Format(' %d %s', [I, AnsiString(DeviceInfo^.Name)])); + end; + + WriteLn('Use device ', DeviceIndex); + WriteLn(' Name ', Pa_GetDeviceInfo(DeviceIndex)^.Name); + WriteLn(' Max output channels ', Pa_GetDeviceInfo(DeviceIndex)^.MaxOutputChannels); + + Initialize(Param); + Param.Device := DeviceIndex; + Param.ChannelCount := 1; + Param.SampleFormat := paFloat32; + param.SuggestedLatency := Pa_GetDeviceInfo(DeviceIndex)^.DefaultHighOutputLatency; + param.HostApiSpecificStreamInfo := nil; + + Buffer := TSherpaOnnxCircularBuffer.Create(30 * DeviceSampleRate); + + + { Note(fangjun): PortAudio invokes PlayCallback in a separate thread. 
} + Status := Pa_OpenStream(stream, nil, @Param, DeviceSampleRate, paFramesPerBufferUnspecified, paNoFlag, + PPaStreamCallback(@PlayCallback), nil); + + if Status <> paNoError then + begin + WriteLn('Failed to open stream, ', Pa_GetErrorText(Status)); + Pa_Terminate; + Exit; + end; + + InitCriticalSection(CriticalSection); + + Status := Pa_StartStream(stream); + if Status <> paNoError then + begin + WriteLn('Failed to start stream, ', Pa_GetErrorText(Status)); + Pa_Terminate; + Exit; + end; + + WriteLn('There are ', Tts.GetNumSpeakers, ' speakers'); + + Text := '某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。'; + + Audio := Tts.Generate(Text, SpeakerId, Speed, + PSherpaOnnxGeneratedAudioCallbackWithArg(@GenerateCallback), nil); + FinishedGeneration := True; + SherpaOnnxWriteWave('./matcha-zh-playback.wav', Audio.Samples, Audio.SampleRate); + WriteLn('Saved to ./matcha-zh-playback.wav'); + + while not FinishedPlaying do + Pa_Sleep(100); {sleep for 0.1 second } + {TODO(fangjun): Use an event to indicate the play is finished} + + DoneCriticalSection(CriticalSection); + + FreeAndNil(Tts); + FreeAndNil(Resampler); + + Status := Pa_CloseStream(stream); + if Status <> paNoError then + begin + WriteLn('Failed to close stream, ', Pa_GetErrorText(Status)); + Exit; + end; + + Status := Pa_Terminate; + if Status <> paNoError then + begin + WriteLn('Failed to deinitialize portaudio, ', Pa_GetErrorText(Status)); + Exit; + end; +end. + diff --git a/pascal-api-examples/tts/matcha-zh.pas b/pascal-api-examples/tts/matcha-zh.pas new file mode 100644 index 000000000..c94100952 --- /dev/null +++ b/pascal-api-examples/tts/matcha-zh.pas @@ -0,0 +1,57 @@ +{ Copyright (c) 2025 Xiaomi Corporation } +program matcha_zh; +{ +This file shows how to use the text to speech API of sherpa-onnx +with MatchaTTS models. + +It generates speech from text and saves it to a wave file. 
+ +If you want to play it while it is generating, please see +./matcha-zh-playback.pas +} + +{$mode objfpc} + +uses + SysUtils, + sherpa_onnx; + +function GetOfflineTts: TSherpaOnnxOfflineTts; +var + Config: TSherpaOnnxOfflineTtsConfig; +begin + Config.Model.Matcha.AcousticModel := './matcha-icefall-zh-baker/model-steps-3.onnx'; + Config.Model.Matcha.Vocoder := './hifigan_v2.onnx'; + Config.Model.Matcha.Lexicon := './matcha-icefall-zh-baker/lexicon.txt'; + Config.Model.Matcha.Tokens := './matcha-icefall-zh-baker/tokens.txt'; + Config.Model.Matcha.DictDir := './matcha-icefall-zh-baker/dict'; + Config.Model.NumThreads := 1; + Config.Model.Debug := False; + Config.RuleFsts := './matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst'; + Config.MaxNumSentences := 1; + + Result := TSherpaOnnxOfflineTts.Create(Config); +end; + +var + Tts: TSherpaOnnxOfflineTts; + Audio: TSherpaOnnxGeneratedAudio; + + Text: AnsiString; + Speed: Single = 1.0; {Use a larger value to speak faster} + SpeakerId: Integer = 0; + +begin + Tts := GetOfflineTts; + + WriteLn('There are ', Tts.GetNumSpeakers, ' speakers'); + + Text := '某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。'; + + Audio := Tts.Generate(Text, SpeakerId, Speed); + SherpaOnnxWriteWave('./matcha-zh.wav', Audio.Samples, Audio.SampleRate); + WriteLn('Saved to ./matcha-zh.wav'); + + FreeAndNil(Tts); +end. + diff --git a/pascal-api-examples/tts/piper-playback.pas b/pascal-api-examples/tts/piper-playback.pas index b9cd10b71..5e65a34f0 100644 --- a/pascal-api-examples/tts/piper-playback.pas +++ b/pascal-api-examples/tts/piper-playback.pas @@ -1,5 +1,5 @@ { Copyright (c) 2024 Xiaomi Corporation } -program piper; +program piper_playback; { This file shows how to use the text to speech API of sherpa-onnx with Piper models. 
diff --git a/pascal-api-examples/tts/run-matcha-en-playback.sh b/pascal-api-examples/tts/run-matcha-en-playback.sh new file mode 100755 index 000000000..ffa677e94 --- /dev/null +++ b/pascal-api-examples/tts/run-matcha-en-playback.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + popd +fi + +# please visit +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker +# matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker +# to download more models +if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 + tar xf matcha-icefall-en_US-ljspeech.tar.bz2 + rm matcha-icefall-en_US-ljspeech.tar.bz2 +fi + +if [ ! 
-f ./hifigan_v2.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx +fi + +fpc \ + -dSHERPA_ONNX_USE_SHARED_LIBS \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + -Fl/usr/local/Cellar/portaudio/19.7.0/lib \ + ./matcha-en-playback.pas + +# Please see ../portaudio-test/README.md +# for how to install portaudio on macOS + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./matcha-en-playback diff --git a/pascal-api-examples/tts/run-matcha-en.sh b/pascal-api-examples/tts/run-matcha-en.sh new file mode 100755 index 000000000..084e672b1 --- /dev/null +++ b/pascal-api-examples/tts/run-matcha-en.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + popd +fi + +# please visit +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker +# matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker +# to download more models +if [ ! 
-f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 + tar xf matcha-icefall-en_US-ljspeech.tar.bz2 + rm matcha-icefall-en_US-ljspeech.tar.bz2 +fi + +if [ ! -f ./hifigan_v2.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx +fi + +fpc \ + -dSHERPA_ONNX_USE_SHARED_LIBS \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + ./matcha-en.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./matcha-en diff --git a/pascal-api-examples/tts/run-matcha-zh-playback.sh b/pascal-api-examples/tts/run-matcha-zh-playback.sh new file mode 100755 index 000000000..e12ad22af --- /dev/null +++ b/pascal-api-examples/tts/run-matcha-zh-playback.sh @@ -0,0 +1,52 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + popd +fi + +# please visit +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker +# to download more models +if [ ! 
-f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 + tar xvf matcha-icefall-zh-baker.tar.bz2 + rm matcha-icefall-zh-baker.tar.bz2 +fi + +if [ ! -f ./hifigan_v2.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx +fi + +fpc \ + -dSHERPA_ONNX_USE_SHARED_LIBS \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + -Fl/usr/local/Cellar/portaudio/19.7.0/lib \ + ./matcha-zh-playback.pas + +# Please see ../portaudio-test/README.md +# for how to install portaudio on macOS + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./matcha-zh-playback diff --git a/pascal-api-examples/tts/run-matcha-zh.sh b/pascal-api-examples/tts/run-matcha-zh.sh new file mode 100755 index 000000000..a7d83d379 --- /dev/null +++ b/pascal-api-examples/tts/run-matcha-zh.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +SHERPA_ONNX_DIR=$(cd $SCRIPT_DIR/../.. && pwd) + +echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR" + +if [[ ! -f ../../build/install/lib/libsherpa-onnx-c-api.dylib && ! -f ../../build/install/lib/libsherpa-onnx-c-api.so && ! -f ../../build/install/lib/sherpa-onnx-c-api.dll ]]; then + mkdir -p ../../build + pushd ../../build + cmake \ + -DCMAKE_INSTALL_PREFIX=./install \ + -DSHERPA_ONNX_ENABLE_PYTHON=OFF \ + -DSHERPA_ONNX_ENABLE_TESTS=OFF \ + -DSHERPA_ONNX_ENABLE_CHECK=OFF \ + -DBUILD_SHARED_LIBS=ON \ + -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \ + .. + + cmake --build . --target install --config Release + popd +fi + +# please visit +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker +# to download more models +if [ ! 
-f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 + tar xvf matcha-icefall-zh-baker.tar.bz2 + rm matcha-icefall-zh-baker.tar.bz2 +fi + +if [ ! -f ./hifigan_v2.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx +fi + +fpc \ + -dSHERPA_ONNX_USE_SHARED_LIBS \ + -Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api \ + -Fl$SHERPA_ONNX_DIR/build/install/lib \ + ./matcha-zh.pas + +export LD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$LD_LIBRARY_PATH +export DYLD_LIBRARY_PATH=$SHERPA_ONNX_DIR/build/install/lib:$DYLD_LIBRARY_PATH + +./matcha-zh diff --git a/sherpa-onnx/pascal-api/sherpa_onnx.pas b/sherpa-onnx/pascal-api/sherpa_onnx.pas index cff215759..442c8a504 100644 --- a/sherpa-onnx/pascal-api/sherpa_onnx.pas +++ b/sherpa-onnx/pascal-api/sherpa_onnx.pas @@ -62,11 +62,26 @@ TSherpaOnnxOfflineTtsVitsModelConfig = record class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsVitsModelConfig); end; + TSherpaOnnxOfflineTtsMatchaModelConfig = record + AcousticModel: AnsiString; + Vocoder: AnsiString; + Lexicon: AnsiString; + Tokens: AnsiString; + DataDir: AnsiString; + NoiseScale: Single; + LengthScale: Single; + DictDir: AnsiString; + + function ToString: AnsiString; + class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsMatchaModelConfig); + end; + TSherpaOnnxOfflineTtsModelConfig = record Vits: TSherpaOnnxOfflineTtsVitsModelConfig; NumThreads: Integer; Debug: Boolean; Provider: AnsiString; + Matcha: TSherpaOnnxOfflineTtsMatchaModelConfig; function ToString: AnsiString; class operator Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsModelConfig); @@ -713,11 +728,23 @@ SherpaOnnxOfflineTtsVitsModelConfig = record DictDir: PAnsiChar; end; + SherpaOnnxOfflineTtsMatchaModelConfig = record + AcousticModel: PAnsiChar; 
+ Vocoder: PAnsiChar; + Lexicon: PAnsiChar; + Tokens: PAnsiChar; + DataDir: PAnsiChar; + NoiseScale: cfloat; + LengthScale: cfloat; + DictDir: PAnsiChar; + end; + SherpaOnnxOfflineTtsModelConfig = record Vits: SherpaOnnxOfflineTtsVitsModelConfig; NumThreads: cint32; Debug: cint32; Provider: PAnsiChar; + Matcha: SherpaOnnxOfflineTtsMatchaModelConfig; end; SherpaOnnxOfflineTtsConfig = record @@ -1853,15 +1880,40 @@ function TSherpaOnnxOfflineTtsVitsModelConfig.ToString: AnsiString; Dest.LengthScale := 1.0; end; +function TSherpaOnnxOfflineTtsMatchaModelConfig.ToString: AnsiString; +begin + Result := Format('TSherpaOnnxOfflineTtsMatchaModelConfig(' + + 'AcousticModel := %s, ' + + 'Vocoder := %s, ' + + 'Lexicon := %s, ' + + 'Tokens := %s, ' + + 'DataDir := %s, ' + + 'NoiseScale := %.2f, ' + + 'LengthScale := %.2f, ' + + 'DictDir := %s' + + ')', + [Self.AcousticModel, Self.Vocoder, Self.Lexicon, Self.Tokens, + Self.DataDir, Self.NoiseScale, Self.LengthScale, Self.DictDir + ]); +end; + +class operator TSherpaOnnxOfflineTtsMatchaModelConfig.Initialize({$IFDEF FPC}var{$ELSE}out{$ENDIF} Dest: TSherpaOnnxOfflineTtsMatchaModelConfig); +begin + Dest.NoiseScale := 0.667; + Dest.LengthScale := 1.0; +end; + function TSherpaOnnxOfflineTtsModelConfig.ToString: AnsiString; begin Result := Format('TSherpaOnnxOfflineTtsModelConfig(' + 'Vits := %s, ' + 'NumThreads := %d, ' + 'Debug := %s, ' + - 'Provider := %s' + + 'Provider := %s, ' + + 'Matcha := %s' + ')', - [Self.Vits.ToString, Self.NumThreads, Self.Debug.ToString, Self.Provider + [Self.Vits.ToString, Self.NumThreads, Self.Debug.ToString, Self.Provider, + Self.Matcha.ToString ]); end; @@ -1905,6 +1957,15 @@ constructor TSherpaOnnxOfflineTts.Create(Config: TSherpaOnnxOfflineTtsConfig); C.Model.Vits.LengthScale := Config.Model.Vits.LengthScale; C.Model.Vits.DictDir := PAnsiChar(Config.Model.Vits.DictDir); + C.Model.Matcha.AcousticModel := PAnsiChar(Config.Model.Matcha.AcousticModel); + C.Model.Matcha.Vocoder := 
PAnsiChar(Config.Model.Matcha.Vocoder); + C.Model.Matcha.Lexicon := PAnsiChar(Config.Model.Matcha.Lexicon); + C.Model.Matcha.Tokens := PAnsiChar(Config.Model.Matcha.Tokens); + C.Model.Matcha.DataDir := PAnsiChar(Config.Model.Matcha.DataDir); + C.Model.Matcha.NoiseScale := Config.Model.Matcha.NoiseScale; + C.Model.Matcha.LengthScale := Config.Model.Matcha.LengthScale; + C.Model.Matcha.DictDir := PAnsiChar(Config.Model.Matcha.DictDir); + C.Model.NumThreads := Config.Model.NumThreads; C.Model.Provider := PAnsiChar(Config.Model.Provider); C.Model.Debug := Ord(Config.Model.Debug);