From 3eced3e7ee0ba5a51a3cba9fd819a5476940353f Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Sun, 5 Jan 2025 15:08:19 +0800 Subject: [PATCH] Add C# and JavaScript (wasm) API for MatchaTTS models (#1682) --- .github/scripts/test-dot-net.sh | 32 +++-- .github/scripts/test-nodejs-npm.sh | 54 ++++++-- .github/workflows/test-dot-net.yaml | 44 +++++++ dotnet-examples/offline-tts-play/Program.cs | 109 ++++++++++++---- .../offline-tts-play/run-hf-fanchen.sh | 4 +- .../offline-tts-play/run-matcha-en.sh | 26 ++++ .../offline-tts-play/run-matcha-zh.sh | 27 ++++ dotnet-examples/offline-tts-play/run-piper.sh | 4 +- dotnet-examples/offline-tts/Program.cs | 74 +++++++++-- dotnet-examples/offline-tts/run-aishell3.sh | 4 +- dotnet-examples/offline-tts/run-hf-fanchen.sh | 6 +- dotnet-examples/offline-tts/run-matcha-en.sh | 26 ++++ dotnet-examples/offline-tts/run-matcha-zh.sh | 27 ++++ dotnet-examples/offline-tts/run-piper.sh | 4 +- nodejs-examples/README.md | 48 ++++++- nodejs-examples/test-offline-tts-matcha-en.js | 40 ++++++ nodejs-examples/test-offline-tts-matcha-zh.js | 41 ++++++ ...-tts-en.js => test-offline-tts-vits-en.js} | 4 +- ...-tts-zh.js => test-offline-tts-vits-zh.js} | 4 +- scripts/dotnet/OfflineTtsMatchaModelConfig.cs | 44 +++++++ scripts/dotnet/OfflineTtsModelConfig.cs | 5 +- scripts/dotnet/examples/Common.csproj | 2 +- scripts/dotnet/sherpa-onnx.csproj.in | 2 +- scripts/dotnet/sherpa-onnx.csproj.runtime.in | 2 +- wasm/tts/sherpa-onnx-tts.js | 117 +++++++++++++++++- wasm/tts/sherpa-onnx-wasm-main-tts.cc | 15 ++- 26 files changed, 677 insertions(+), 88 deletions(-) create mode 100755 dotnet-examples/offline-tts-play/run-matcha-en.sh create mode 100755 dotnet-examples/offline-tts-play/run-matcha-zh.sh create mode 100755 dotnet-examples/offline-tts/run-matcha-en.sh create mode 100755 dotnet-examples/offline-tts/run-matcha-zh.sh create mode 100644 nodejs-examples/test-offline-tts-matcha-en.js create mode 100644 nodejs-examples/test-offline-tts-matcha-zh.js rename 
nodejs-examples/{test-offline-tts-en.js => test-offline-tts-vits-en.js} (92%) rename nodejs-examples/{test-offline-tts-zh.js => test-offline-tts-vits-zh.js} (92%) create mode 100644 scripts/dotnet/OfflineTtsMatchaModelConfig.cs diff --git a/.github/scripts/test-dot-net.sh b/.github/scripts/test-dot-net.sh index f4bfc66c5..7c339e157 100755 --- a/.github/scripts/test-dot-net.sh +++ b/.github/scripts/test-dot-net.sh @@ -2,7 +2,27 @@ cd dotnet-examples/ -cd ./offline-speaker-diarization +cd ./offline-tts +./run-matcha-zh.sh +ls -lh *.wav +./run-matcha-en.sh +ls -lh *.wav +./run-aishell3.sh +ls -lh *.wav +./run-piper.sh +ls -lh *.wav +./run-hf-fanchen.sh +ls -lh *.wav +ls -lh + +pushd ../.. + +mkdir tts + +cp dotnet-examples/offline-tts/*.wav ./tts +popd + +cd ../offline-speaker-diarization ./run.sh rm -rfv *.onnx rm -fv *.wav @@ -76,14 +96,4 @@ cd ../spoken-language-identification ./run.sh rm -rf sherpa-onnx-* -cd ../offline-tts -./run-aishell3.sh -./run-piper.sh -./run-hf-fanchen.sh -ls -lh -cd ../.. 
- -mkdir tts - -cp dotnet-examples/offline-tts/*.wav ./tts diff --git a/.github/scripts/test-nodejs-npm.sh b/.github/scripts/test-nodejs-npm.sh index 518d173b6..967944843 100755 --- a/.github/scripts/test-nodejs-npm.sh +++ b/.github/scripts/test-nodejs-npm.sh @@ -9,6 +9,48 @@ git status ls -lh ls -lh node_modules +# offline tts +# +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 +tar xvf matcha-icefall-zh-baker.tar.bz2 +rm matcha-icefall-zh-baker.tar.bz2 + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx + +node ./test-offline-tts-matcha-zh.js + +rm -rf matcha-icefall-zh-baker +rm hifigan_v2.onnx + +echo "---" + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 +tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 +rm matcha-icefall-en_US-ljspeech.tar.bz2 + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx + +node ./test-offline-tts-matcha-en.js + +rm -rf matcha-icefall-en_US-ljspeech +rm hifigan_v2.onnx + +echo "---" + +curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 +tar xf vits-piper-en_US-amy-low.tar.bz2 +node ./test-offline-tts-vits-en.js +rm -rf vits-piper-en_US-amy-low* + +echo "---" + +curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 +tar xvf vits-icefall-zh-aishell3.tar.bz2 +node ./test-offline-tts-vits-zh.js +rm -rf vits-icefall-zh-aishell3* + +ls -lh *.wav + echo '-----speaker diarization----------' curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 @@ -147,15 +189,3 @@ tar xvf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 rm 
sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18.tar.bz2 node ./test-online-zipformer2-ctc-hlg.js rm -rf sherpa-onnx-streaming-zipformer-ctc-small-2024-03-18 - -# offline tts - -curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 -tar xf vits-piper-en_US-amy-low.tar.bz2 -node ./test-offline-tts-en.js -rm -rf vits-piper-en_US-amy-low* - -curl -LS -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 -tar xvf vits-icefall-zh-aishell3.tar.bz2 -node ./test-offline-tts-zh.js -rm -rf vits-icefall-zh-aishell3* diff --git a/.github/workflows/test-dot-net.yaml b/.github/workflows/test-dot-net.yaml index cc20c9ab5..8ca19439f 100644 --- a/.github/workflows/test-dot-net.yaml +++ b/.github/workflows/test-dot-net.yaml @@ -92,6 +92,50 @@ jobs: python-version: ["3.8"] steps: + - name: Check space + shell: bash + run: | + df -h + + - name: Free space + shell: bash + run: | + df -h + rm -rf /opt/hostedtoolcache + df -h + + - name: Free more space + shell: bash + run: | + # https://github.com/orgs/community/discussions/25678 + cd /opt + find . -maxdepth 1 -mindepth 1 '!' -path ./containerd '!' -path ./actionarchivecache '!' -path ./runner '!' 
-path ./runner-cache -exec rm -rf '{}' ';' + + sudo rm -rf /usr/share/dotnet + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" + + - name: Free Disk Space (Ubuntu) + uses: jlumbroso/free-disk-space@main + with: + # this might remove tools that are actually needed, + # if set to "true" but frees about 6 GB + tool-cache: false + + # all of these default to true, but feel free to set to + # "false" if necessary for your workflow + android: true + dotnet: false + haskell: true + large-packages: true + docker-images: false + swap-storage: true + + - name: Check space + shell: bash + run: | + df -h + - uses: actions/checkout@v4 with: fetch-depth: 0 diff --git a/dotnet-examples/offline-tts-play/Program.cs b/dotnet-examples/offline-tts-play/Program.cs index 65eb22bf4..543a50cdd 100644 --- a/dotnet-examples/offline-tts-play/Program.cs +++ b/dotnet-examples/offline-tts-play/Program.cs @@ -21,48 +21,56 @@ class OfflineTtsPlayDemo { class Options { - [Option("tts-rule-fsts", Required = false, Default = "", HelpText = "path to rule.fst")] - public string? RuleFsts { get; set; } + public string RuleFsts { get; set; } = string.Empty; + + [Option("tts-rule-fars", Required = false, Default = "", HelpText = "path to rule.far")] + public string RuleFars { get; set; } = string.Empty; - [Option("vits-dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")] - public string? DictDir { get; set; } + [Option("dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")] + public string DictDir { get; set; } = string.Empty; - [Option("vits-data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")] - public string? 
DataDir { get; set; } + [Option("data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")] + public string DataDir { get; set; } = string.Empty; - [Option("vits-length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")] - public float LengthScale { get; set; } + [Option("length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")] + public float LengthScale { get; set; } = 1; - [Option("vits-noise-scale", Required = false, Default = 0.667f, HelpText = "noise_scale for VITS models")] - public float NoiseScale { get; set; } + [Option("noise-scale", Required = false, Default = 0.667f, HelpText = "noise_scale for VITS or Matcha models")] + public float NoiseScale { get; set; } = 0.667F; - [Option("vits-noise-scale-w", Required = false, Default = 0.8f, HelpText = "noise_scale_w for VITS models")] - public float NoiseScaleW { get; set; } + [Option("vits-noise-scale-w", Required = false, Default = 0.8F, HelpText = "noise_scale_w for VITS models")] + public float NoiseScaleW { get; set; } = 0.8F; - [Option("vits-lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")] - public string? Lexicon { get; set; } + [Option("lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")] + public string Lexicon { get; set; } = string.Empty; - [Option("vits-tokens", Required = false, Default = "", HelpText = "Path to tokens.txt")] - public string? 
Tokens { get; set; } + [Option("tokens", Required = true, Default = "", HelpText = "Path to tokens.txt")] + public string Tokens { get; set; } = string.Empty; [Option("tts-max-num-sentences", Required = false, Default = 1, HelpText = "Maximum number of sentences that we process at a time.")] - public int MaxNumSentences { get; set; } + public int MaxNumSentences { get; set; } = 1; [Option(Required = false, Default = 0, HelpText = "1 to show debug messages.")] - public int Debug { get; set; } + public int Debug { get; set; } = 0; + + [Option("vits-model", Required = false, HelpText = "Path to VITS model")] + public string Model { get; set; } = string.Empty; - [Option("vits-model", Required = true, HelpText = "Path to VITS model")] - public string? Model { get; set; } + [Option("matcha-acoustic-model", Required = false, HelpText = "Path to the acoustic model of Matcha")] + public string AcousticModel { get; set; } = ""; + + [Option("matcha-vocoder", Required = false, HelpText = "Path to the vocoder model of Matcha")] + public string Vocoder { get; set; } = ""; [Option("sid", Required = false, Default = 0, HelpText = "Speaker ID")] - public int SpeakerId { get; set; } + public int SpeakerId { get; set; } = 0; [Option("text", Required = true, HelpText = "Text to synthesize")] - public string? Text { get; set; } + public string Text { get; set; } = string.Empty; [Option("output-filename", Required = true, Default = "./generated.wav", HelpText = "Path to save the generated audio")] - public string? 
OutputFilename { get; set; } + public string OutputFilename { get; set; } = "./generated.wav"; } static void Main(string[] args) @@ -78,6 +86,42 @@ static void Main(string[] args) private static void DisplayHelp(ParserResult result, IEnumerable errs) { string usage = @" +# matcha-icefall-zh-baker + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 +tar xvf matcha-icefall-zh-baker.tar.bz2 +rm matcha-icefall-zh-baker.tar.bz2 + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx + +dotnet run \ + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \ + --matcha-vocoder=./hifigan_v2.onnx \ + --lexicon=./matcha-icefall-zh-baker/lexicon.txt \ + --tokens=./matcha-icefall-zh-baker/tokens.txt \ + --dict-dir=./matcha-icefall-zh-baker/dict \ + --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \ + --debug=1 \ + --output-filename=./matcha-zh.wav \ + --text='某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。' + +# matcha-icefall-en_US-ljspeech + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 +tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 +rm matcha-icefall-en_US-ljspeech.tar.bz2 + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx + +dotnet run \ + --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \ + --matcha-vocoder=./hifigan_v2.onnx \ + --tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \ + --data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \ + --debug=1 \ + --output-filename=./matcha-en.wav \ + --text='Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.' 
+ # vits-aishell3 wget -qq https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2 @@ -85,8 +129,8 @@ tar xf vits-zh-aishell3.tar.bz2 dotnet run \ --vits-model=./vits-zh-aishell3/vits-aishell3.onnx \ - --vits-tokens=./vits-zh-aishell3/tokens.txt \ - --vits-lexicon=./vits-zh-aishell3/lexicon.txt \ + --tokens=./vits-zh-aishell3/tokens.txt \ + --lexicon=./vits-zh-aishell3/lexicon.txt \ --tts-rule-fsts=./vits-zh-aishell3/rule.fst \ --sid=66 \ --debug=1 \ @@ -100,8 +144,8 @@ tar xf vits-piper-en_US-amy-low.tar.bz2 dotnet run \ --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \ - --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \ - --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \ + --tokens=./vits-piper-en_US-amy-low/tokens.txt \ + --data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \ --debug=1 \ --output-filename=./amy.wav \ --text='This is a text to speech application in dotnet with Next Generation Kaldi' @@ -124,6 +168,7 @@ to download more models. 
private static void Run(Options options) { var config = new OfflineTtsConfig(); + config.Model.Vits.Model = options.Model; config.Model.Vits.Lexicon = options.Lexicon; config.Model.Vits.Tokens = options.Tokens; @@ -132,6 +177,16 @@ private static void Run(Options options) config.Model.Vits.NoiseScale = options.NoiseScale; config.Model.Vits.NoiseScaleW = options.NoiseScaleW; config.Model.Vits.LengthScale = options.LengthScale; + + config.Model.Matcha.AcousticModel = options.AcousticModel; + config.Model.Matcha.Vocoder = options.Vocoder; + config.Model.Matcha.Lexicon = options.Lexicon; + config.Model.Matcha.Tokens = options.Tokens; + config.Model.Matcha.DataDir = options.DataDir; + config.Model.Matcha.DictDir = options.DictDir; + config.Model.Matcha.NoiseScale = options.NoiseScale; + config.Model.Matcha.LengthScale = options.LengthScale; + config.Model.NumThreads = 1; config.Model.Debug = options.Debug; config.Model.Provider = "cpu"; diff --git a/dotnet-examples/offline-tts-play/run-hf-fanchen.sh b/dotnet-examples/offline-tts-play/run-hf-fanchen.sh index b16a3ca68..84e668578 100755 --- a/dotnet-examples/offline-tts-play/run-hf-fanchen.sh +++ b/dotnet-examples/offline-tts-play/run-hf-fanchen.sh @@ -8,8 +8,8 @@ fi dotnet run \ --vits-model=./vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx \ - --vits-tokens=./vits-zh-hf-fanchen-C/tokens.txt \ - --vits-lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \ + --tokens=./vits-zh-hf-fanchen-C/tokens.txt \ + --lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \ --tts-rule-fsts=./vits-zh-hf-fanchen-C/phone.fst,./vits-zh-hf-fanchen-C/date.fst,./vits-zh-hf-fanchen-C/number.fst \ --vits-dict-dir=./vits-zh-hf-fanchen-C/dict \ --sid=100 \ diff --git a/dotnet-examples/offline-tts-play/run-matcha-en.sh b/dotnet-examples/offline-tts-play/run-matcha-en.sh new file mode 100755 index 000000000..0f7caa215 --- /dev/null +++ b/dotnet-examples/offline-tts-play/run-matcha-en.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash +set -ex + + +# please visit +# 
https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker +# matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker +# to download more models +if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 + tar xf matcha-icefall-en_US-ljspeech.tar.bz2 + rm matcha-icefall-en_US-ljspeech.tar.bz2 +fi + +if [ ! -f ./hifigan_v2.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx +fi + +dotnet run \ + --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \ + --matcha-vocoder=./hifigan_v2.onnx \ + --tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \ + --data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \ + --debug=1 \ + --output-filename=./matcha-en.wav \ + --text='Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.' diff --git a/dotnet-examples/offline-tts-play/run-matcha-zh.sh b/dotnet-examples/offline-tts-play/run-matcha-zh.sh new file mode 100755 index 000000000..e3b34268c --- /dev/null +++ b/dotnet-examples/offline-tts-play/run-matcha-zh.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +set -ex + +# please visit +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker +# to download more models +if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 + tar xvf matcha-icefall-zh-baker.tar.bz2 + rm matcha-icefall-zh-baker.tar.bz2 +fi + +if [ ! 
-f ./hifigan_v2.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx +fi + + +dotnet run \ + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \ + --matcha-vocoder=./hifigan_v2.onnx \ + --lexicon=./matcha-icefall-zh-baker/lexicon.txt \ + --tokens=./matcha-icefall-zh-baker/tokens.txt \ + --dict-dir=./matcha-icefall-zh-baker/dict \ + --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \ + --debug=1 \ + --output-filename=./matcha-zh.wav \ + --text="某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" diff --git a/dotnet-examples/offline-tts-play/run-piper.sh b/dotnet-examples/offline-tts-play/run-piper.sh index 7c97498d2..1a4d10806 100755 --- a/dotnet-examples/offline-tts-play/run-piper.sh +++ b/dotnet-examples/offline-tts-play/run-piper.sh @@ -9,8 +9,8 @@ fi dotnet run \ --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \ - --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \ - --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \ + --tokens=./vits-piper-en_US-amy-low/tokens.txt \ + --data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \ --debug=1 \ --output-filename=./amy.wav \ --text="This is a text to speech application in dotnet with Next Generation Kaldi" diff --git a/dotnet-examples/offline-tts/Program.cs b/dotnet-examples/offline-tts/Program.cs index f434ebf19..21f90c525 100644 --- a/dotnet-examples/offline-tts/Program.cs +++ b/dotnet-examples/offline-tts/Program.cs @@ -20,25 +20,25 @@ class Options [Option("tts-rule-fars", Required = false, Default = "", HelpText = "path to rule.far")] public string RuleFars { get; set; } = string.Empty; - [Option("vits-dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for jieba.")] + [Option("dict-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for 
jieba.")] public string DictDir { get; set; } = string.Empty; - [Option("vits-data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")] + [Option("data-dir", Required = false, Default = "", HelpText = "Path to the directory containing dict for espeak-ng.")] public string DataDir { get; set; } = string.Empty; - [Option("vits-length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")] + [Option("length-scale", Required = false, Default = 1, HelpText = "speech speed. Larger->Slower; Smaller->faster")] public float LengthScale { get; set; } = 1; - [Option("vits-noise-scale", Required = false, Default = 0.667f, HelpText = "noise_scale for VITS models")] + [Option("noise-scale", Required = false, Default = 0.667f, HelpText = "noise_scale for VITS or Matcha models")] public float NoiseScale { get; set; } = 0.667F; [Option("vits-noise-scale-w", Required = false, Default = 0.8F, HelpText = "noise_scale_w for VITS models")] public float NoiseScaleW { get; set; } = 0.8F; - [Option("vits-lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")] + [Option("lexicon", Required = false, Default = "", HelpText = "Path to lexicon.txt")] public string Lexicon { get; set; } = string.Empty; - [Option("vits-tokens", Required = false, Default = "", HelpText = "Path to tokens.txt")] + [Option("tokens", Required = true, Default = "", HelpText = "Path to tokens.txt")] public string Tokens { get; set; } = string.Empty; [Option("tts-max-num-sentences", Required = false, Default = 1, HelpText = "Maximum number of sentences that we process at a time.")] @@ -47,9 +47,15 @@ class Options [Option(Required = false, Default = 0, HelpText = "1 to show debug messages.")] public int Debug { get; set; } = 0; - [Option("vits-model", Required = true, HelpText = "Path to VITS model")] + [Option("vits-model", Required = false, HelpText = "Path to VITS model")] public string Model { get; 
set; } = string.Empty; + [Option("matcha-acoustic-model", Required = false, HelpText = "Path to the acoustic model of Matcha")] + public string AcousticModel { get; set; } = ""; + + [Option("matcha-vocoder", Required = false, HelpText = "Path to the vocoder model of Matcha")] + public string Vocoder { get; set; } = ""; + [Option("sid", Required = false, Default = 0, HelpText = "Speaker ID")] public int SpeakerId { get; set; } = 0; @@ -73,6 +79,42 @@ static void Main(string[] args) private static void DisplayHelp(ParserResult result, IEnumerable errs) { var usage = @" +# matcha-icefall-zh-baker + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 +tar xvf matcha-icefall-zh-baker.tar.bz2 +rm matcha-icefall-zh-baker.tar.bz2 + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx + +dotnet run \ + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \ + --matcha-vocoder=./hifigan_v2.onnx \ + --lexicon=./matcha-icefall-zh-baker/lexicon.txt \ + --tokens=./matcha-icefall-zh-baker/tokens.txt \ + --dict-dir=./matcha-icefall-zh-baker/dict \ + --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \ + --debug=1 \ + --output-filename=./matcha-zh.wav \ + --text='某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。' + +# matcha-icefall-en_US-ljspeech + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 +tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 +rm matcha-icefall-en_US-ljspeech.tar.bz2 + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx + +dotnet run \ + --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \ + --matcha-vocoder=./hifigan_v2.onnx \ + --tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \ + 
--data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \ + --debug=1 \ + --output-filename=./matcha-en.wav \ + --text='Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.' + # vits-aishell3 curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 @@ -80,8 +122,8 @@ tar xvf vits-icefall-zh-aishell3.tar.bz2 dotnet run \ --vits-model=./vits-icefall-zh-aishell3/model.onnx \ - --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \ - --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \ + --tokens=./vits-icefall-zh-aishell3/tokens.txt \ + --lexicon=./vits-icefall-zh-aishell3/lexicon.txt \ --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \ --tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \ --sid=66 \ @@ -96,8 +138,8 @@ tar xf vits-piper-en_US-amy-low.tar.bz2 dotnet run \ --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \ - --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \ - --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \ + --tokens=./vits-piper-en_US-amy-low/tokens.txt \ + --data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \ --debug=1 \ --output-filename=./amy.wav \ --text='This is a text to speech application in dotnet with Next Generation Kaldi' @@ -128,6 +170,16 @@ private static void Run(Options options) config.Model.Vits.NoiseScale = options.NoiseScale; config.Model.Vits.NoiseScaleW = options.NoiseScaleW; config.Model.Vits.LengthScale = options.LengthScale; + + config.Model.Matcha.AcousticModel = options.AcousticModel; + config.Model.Matcha.Vocoder = options.Vocoder; + config.Model.Matcha.Lexicon = options.Lexicon; + config.Model.Matcha.Tokens = options.Tokens; + config.Model.Matcha.DataDir = options.DataDir; + config.Model.Matcha.DictDir = 
options.DictDir; + config.Model.Matcha.NoiseScale = options.NoiseScale; + config.Model.Matcha.LengthScale = options.LengthScale; + config.Model.NumThreads = 1; config.Model.Debug = options.Debug; config.Model.Provider = "cpu"; diff --git a/dotnet-examples/offline-tts/run-aishell3.sh b/dotnet-examples/offline-tts/run-aishell3.sh index 02380f07c..9a54df349 100755 --- a/dotnet-examples/offline-tts/run-aishell3.sh +++ b/dotnet-examples/offline-tts/run-aishell3.sh @@ -8,8 +8,8 @@ fi dotnet run \ --vits-model=./vits-icefall-zh-aishell3/model.onnx \ - --vits-tokens=./vits-icefall-zh-aishell3/tokens.txt \ - --vits-lexicon=./vits-icefall-zh-aishell3/lexicon.txt \ + --tokens=./vits-icefall-zh-aishell3/tokens.txt \ + --lexicon=./vits-icefall-zh-aishell3/lexicon.txt \ --tts-rule-fsts=./vits-icefall-zh-aishell3/phone.fst,./vits-icefall-zh-aishell3/date.fst,./vits-icefall-zh-aishell3/number.fst \ --tts-rule-fars=./vits-icefall-zh-aishell3/rule.far \ --sid=66 \ diff --git a/dotnet-examples/offline-tts/run-hf-fanchen.sh b/dotnet-examples/offline-tts/run-hf-fanchen.sh index b16a3ca68..a7a52e733 100755 --- a/dotnet-examples/offline-tts/run-hf-fanchen.sh +++ b/dotnet-examples/offline-tts/run-hf-fanchen.sh @@ -8,10 +8,10 @@ fi dotnet run \ --vits-model=./vits-zh-hf-fanchen-C/vits-zh-hf-fanchen-C.onnx \ - --vits-tokens=./vits-zh-hf-fanchen-C/tokens.txt \ - --vits-lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \ + --tokens=./vits-zh-hf-fanchen-C/tokens.txt \ + --lexicon=./vits-zh-hf-fanchen-C/lexicon.txt \ --tts-rule-fsts=./vits-zh-hf-fanchen-C/phone.fst,./vits-zh-hf-fanchen-C/date.fst,./vits-zh-hf-fanchen-C/number.fst \ - --vits-dict-dir=./vits-zh-hf-fanchen-C/dict \ + --dict-dir=./vits-zh-hf-fanchen-C/dict \ --sid=100 \ --debug=1 \ --output-filename=./fanchen-100.wav \ diff --git a/dotnet-examples/offline-tts/run-matcha-en.sh b/dotnet-examples/offline-tts/run-matcha-en.sh new file mode 100755 index 000000000..0f7caa215 --- /dev/null +++ b/dotnet-examples/offline-tts/run-matcha-en.sh @@ 
-0,0 +1,26 @@ +#!/usr/bin/env bash +set -ex + + +# please visit +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker +# matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker +# to download more models +if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 + tar xf matcha-icefall-en_US-ljspeech.tar.bz2 + rm matcha-icefall-en_US-ljspeech.tar.bz2 +fi + +if [ ! -f ./hifigan_v2.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx +fi + +dotnet run \ + --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \ + --matcha-vocoder=./hifigan_v2.onnx \ + --tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \ + --data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \ + --debug=1 \ + --output-filename=./matcha-en.wav \ + --text='Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.' diff --git a/dotnet-examples/offline-tts/run-matcha-zh.sh b/dotnet-examples/offline-tts/run-matcha-zh.sh new file mode 100755 index 000000000..e3b34268c --- /dev/null +++ b/dotnet-examples/offline-tts/run-matcha-zh.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash +set -ex + +# please visit +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker +# to download more models +if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 + tar xvf matcha-icefall-zh-baker.tar.bz2 + rm matcha-icefall-zh-baker.tar.bz2 +fi + +if [ ! 
-f ./hifigan_v2.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx +fi + + +dotnet run \ + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \ + --matcha-vocoder=./hifigan_v2.onnx \ + --lexicon=./matcha-icefall-zh-baker/lexicon.txt \ + --tokens=./matcha-icefall-zh-baker/tokens.txt \ + --dict-dir=./matcha-icefall-zh-baker/dict \ + --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \ + --debug=1 \ + --output-filename=./matcha-zh.wav \ + --text="某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" diff --git a/dotnet-examples/offline-tts/run-piper.sh b/dotnet-examples/offline-tts/run-piper.sh index ff639c570..273799bb3 100755 --- a/dotnet-examples/offline-tts/run-piper.sh +++ b/dotnet-examples/offline-tts/run-piper.sh @@ -10,8 +10,8 @@ fi dotnet run \ --vits-model=./vits-piper-en_US-amy-low/en_US-amy-low.onnx \ - --vits-tokens=./vits-piper-en_US-amy-low/tokens.txt \ - --vits-data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \ + --tokens=./vits-piper-en_US-amy-low/tokens.txt \ + --data-dir=./vits-piper-en_US-amy-low/espeak-ng-data \ --debug=1 \ --output-filename=./amy.wav \ --text="This is a text to speech application in dotnet with Next Generation Kaldi" diff --git a/nodejs-examples/README.md b/nodejs-examples/README.md index a953573b6..0c59b7bcc 100644 --- a/nodejs-examples/README.md +++ b/nodejs-examples/README.md @@ -42,9 +42,45 @@ node ./test-offline-speaker-diarization.js In the following, we demonstrate how to run text-to-speech. 
-## ./test-offline-tts-en.js +## ./test-offline-tts-matcha-zh.js -[./test-offline-tts-en.js](./test-offline-tts-en.js) shows how to use +[./test-offline-tts-matcha-zh.js](./test-offline-tts-matcha-zh.js) shows how to use +[matcha-icefall-zh-baker](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker) +for text-to-speech. + +You can use the following command to run it: + +```bash +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 +tar xvf matcha-icefall-zh-baker.tar.bz2 +rm matcha-icefall-zh-baker.tar.bz2 + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx + +node ./test-offline-tts-matcha-zh.js +``` + +## ./test-offline-tts-matcha-en.js + +[./test-offline-tts-matcha-en.js](./test-offline-tts-matcha-en.js) shows how to use +[matcha-icefall-en_US-ljspeech](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker) +for text-to-speech. + +You can use the following command to run it: + +```bash +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 +tar xvf matcha-icefall-en_US-ljspeech.tar.bz2 +rm matcha-icefall-en_US-ljspeech.tar.bz2 + +wget https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx + +node ./test-offline-tts-matcha-en.js +``` + +## ./test-offline-tts-vits-en.js + +[./test-offline-tts-vits-en.js](./test-offline-tts-vits-en.js) shows how to use [vits-piper-en_US-amy-low.tar.bz2](https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2) for text-to-speech. 
@@ -53,12 +89,12 @@ You can use the following command to run it: ```bash wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2 tar xvf vits-piper-en_US-amy-low.tar.bz2 -node ./test-offline-tts-en.js +node ./test-offline-tts-vits-en.js ``` -## ./test-offline-tts-zh.js +## ./test-offline-tts-vits-zh.js -[./test-offline-tts-zh.js](./test-offline-tts-zh.js) shows how to use +[./test-offline-tts-vits-zh.js](./test-offline-tts-vits-zh.js) shows how to use a VITS pretrained model [aishell3](https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/vits.html#vits-model-aishell3) for text-to-speech. @@ -68,7 +104,7 @@ You can use the following command to run it: ```bash wget -q https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2 tar xvf vits-icefall-zh-aishell3.tar.bz2 -node ./test-offline-tts-zh.js +node ./test-offline-tts-vits-zh.js ``` # Speech-to-text diff --git a/nodejs-examples/test-offline-tts-matcha-en.js b/nodejs-examples/test-offline-tts-matcha-en.js new file mode 100644 index 000000000..c2f982043 --- /dev/null +++ b/nodejs-examples/test-offline-tts-matcha-en.js @@ -0,0 +1,40 @@ +// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang) + +const sherpa_onnx = require('sherpa-onnx'); + +function createOfflineTts() { + let offlineTtsMatchaModelConfig = { + acousticModel: './matcha-icefall-en_US-ljspeech/model-steps-3.onnx', + vocoder: './hifigan_v2.onnx', + lexicon: './matcha-icefall-en_US-ljspeech/lexicon.txt', + tokens: './matcha-icefall-en_US-ljspeech/tokens.txt', + dataDir: './matcha-icefall-en_US-ljspeech/espeak-ng-data', + + noiseScale: 0.667, + lengthScale: 1.0, + }; + let offlineTtsModelConfig = { + offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig, + numThreads: 1, + debug: 1, + provider: 'cpu', + }; + + let offlineTtsConfig = { + offlineTtsModelConfig: offlineTtsModelConfig, + maxNumSentences: 1, + }; + + return 
sherpa_onnx.createOfflineTts(offlineTtsConfig); +} + +const tts = createOfflineTts(); +const speakerId = 0; +const speed = 1.0; +const text = + 'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.' + +const audio = tts.generate({text: text, sid: speakerId, speed: speed}); +tts.save('./test-matcha-en.wav', audio); +console.log('Saved to test-matcha-en.wav successfully.'); +tts.free(); diff --git a/nodejs-examples/test-offline-tts-matcha-zh.js b/nodejs-examples/test-offline-tts-matcha-zh.js new file mode 100644 index 000000000..21c6a0875 --- /dev/null +++ b/nodejs-examples/test-offline-tts-matcha-zh.js @@ -0,0 +1,41 @@ +// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang) + +const sherpa_onnx = require('sherpa-onnx'); + +function createOfflineTts() { + let offlineTtsMatchaModelConfig = { + acousticModel: './matcha-icefall-zh-baker/model-steps-3.onnx', + vocoder: './hifigan_v2.onnx', + lexicon: './matcha-icefall-zh-baker/lexicon.txt', + tokens: './matcha-icefall-zh-baker/tokens.txt', + dictDir: './matcha-icefall-zh-baker/dict', + noiseScale: 0.667, + lengthScale: 1.0, + }; + let offlineTtsModelConfig = { + offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig, + numThreads: 1, + debug: 1, + provider: 'cpu', + }; + + let offlineTtsConfig = { + offlineTtsModelConfig: offlineTtsModelConfig, + maxNumSentences: 1, + ruleFsts: + './matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst', + }; + + return sherpa_onnx.createOfflineTts(offlineTtsConfig); +} + +const tts = createOfflineTts(); +const speakerId = 0; +const speed = 1.0; +const text = + '当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔. 
某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。' + +const audio = tts.generate({text: text, sid: speakerId, speed: speed}); +tts.save('./test-matcha-zh.wav', audio); +console.log('Saved to test-matcha-zh.wav successfully.'); +tts.free(); diff --git a/nodejs-examples/test-offline-tts-en.js b/nodejs-examples/test-offline-tts-vits-en.js similarity index 92% rename from nodejs-examples/test-offline-tts-en.js rename to nodejs-examples/test-offline-tts-vits-en.js index 61c23f5eb..9e6c8da58 100644 --- a/nodejs-examples/test-offline-tts-en.js +++ b/nodejs-examples/test-offline-tts-vits-en.js @@ -37,7 +37,7 @@ const audio = tts.generate({ speed: speed }); -tts.save('./test-en.wav', audio); -console.log('Saved to test-en.wav successfully.'); +tts.save('./test-vits-en.wav', audio); +console.log('Saved to test-vits-en.wav successfully.'); tts.free(); diff --git a/nodejs-examples/test-offline-tts-zh.js b/nodejs-examples/test-offline-tts-vits-zh.js similarity index 92% rename from nodejs-examples/test-offline-tts-zh.js rename to nodejs-examples/test-offline-tts-vits-zh.js index 7be148862..78c81ead3 100644 --- a/nodejs-examples/test-offline-tts-zh.js +++ b/nodejs-examples/test-offline-tts-vits-zh.js @@ -34,6 +34,6 @@ const speakerId = 66; const speed = 1.0; const audio = tts.generate( {text: '3年前中国总人口是1411778724人', sid: speakerId, speed: speed}); -tts.save('./test-zh.wav', audio); -console.log('Saved to test-zh.wav successfully.'); +tts.save('./test-vits-zh.wav', audio); +console.log('Saved to test-vits-zh.wav successfully.'); tts.free(); diff --git a/scripts/dotnet/OfflineTtsMatchaModelConfig.cs b/scripts/dotnet/OfflineTtsMatchaModelConfig.cs new file mode 100644 index 000000000..8743e1223 --- /dev/null +++ b/scripts/dotnet/OfflineTtsMatchaModelConfig.cs @@ -0,0 +1,44 @@ +/// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang) + +using System.Runtime.InteropServices; + +namespace SherpaOnnx +{ + [StructLayout(LayoutKind.Sequential)] + 
public struct OfflineTtsMatchaModelConfig + { + public OfflineTtsMatchaModelConfig() + { + AcousticModel = ""; + Vocoder = ""; + Lexicon = ""; + Tokens = ""; + DataDir = ""; + + NoiseScale = 0.667F; + LengthScale = 1.0F; + + DictDir = ""; + } + [MarshalAs(UnmanagedType.LPStr)] + public string AcousticModel; + + [MarshalAs(UnmanagedType.LPStr)] + public string Vocoder; + + [MarshalAs(UnmanagedType.LPStr)] + public string Lexicon; + + [MarshalAs(UnmanagedType.LPStr)] + public string Tokens; + + [MarshalAs(UnmanagedType.LPStr)] + public string DataDir; + + public float NoiseScale; + public float LengthScale; + + [MarshalAs(UnmanagedType.LPStr)] + public string DictDir; + } +} diff --git a/scripts/dotnet/OfflineTtsModelConfig.cs b/scripts/dotnet/OfflineTtsModelConfig.cs index 40aa63912..e5caa1173 100644 --- a/scripts/dotnet/OfflineTtsModelConfig.cs +++ b/scripts/dotnet/OfflineTtsModelConfig.cs @@ -11,6 +11,7 @@ public struct OfflineTtsModelConfig public OfflineTtsModelConfig() { Vits = new OfflineTtsVitsModelConfig(); + Matcha = new OfflineTtsMatchaModelConfig(); NumThreads = 1; Debug = 0; Provider = "cpu"; @@ -21,5 +22,7 @@ public OfflineTtsModelConfig() public int Debug; [MarshalAs(UnmanagedType.LPStr)] public string Provider; + + public OfflineTtsMatchaModelConfig Matcha; } -} \ No newline at end of file +} diff --git a/scripts/dotnet/examples/Common.csproj b/scripts/dotnet/examples/Common.csproj index 868c1470f..6925e0381 100644 --- a/scripts/dotnet/examples/Common.csproj +++ b/scripts/dotnet/examples/Common.csproj @@ -1,7 +1,7 @@  - .net6 + net8.0 /tmp/packages;$(RestoreSources);https://api.nuget.org/v3/index.json diff --git a/scripts/dotnet/sherpa-onnx.csproj.in b/scripts/dotnet/sherpa-onnx.csproj.in index 0cbe7efab..d81a883e5 100644 --- a/scripts/dotnet/sherpa-onnx.csproj.in +++ b/scripts/dotnet/sherpa-onnx.csproj.in @@ -4,7 +4,7 @@ README.md Library 10.0 - net6.0;net45;net40;net35;net20;netstandard2.0 + 
net8.0;net7.0;net6.0;net45;net40;net35;net20;netstandard2.0 linux-x64;linux-arm64;osx-x64;osx-arm64;win-x64;win-x86;win-arm64 true sherpa-onnx diff --git a/scripts/dotnet/sherpa-onnx.csproj.runtime.in b/scripts/dotnet/sherpa-onnx.csproj.runtime.in index 2a387ccea..f21c3da37 100644 --- a/scripts/dotnet/sherpa-onnx.csproj.runtime.in +++ b/scripts/dotnet/sherpa-onnx.csproj.runtime.in @@ -3,7 +3,7 @@ Apache-2.0 README.md Library - net6.0;net45;net40;net35;net20;netstandard2.0 + net8.0;net7.0;net6.0;net45;net40;net35;net20;netstandard2.0 {{ dotnet_rid }} sherpa-onnx {{ version }} diff --git a/wasm/tts/sherpa-onnx-tts.js b/wasm/tts/sherpa-onnx-tts.js index 4d68b854f..59158ae76 100644 --- a/wasm/tts/sherpa-onnx-tts.js +++ b/wasm/tts/sherpa-onnx-tts.js @@ -8,6 +8,10 @@ function freeConfig(config, Module) { freeConfig(config.config, Module) } + if ('config2' in config) { + freeConfig(config.config2, Module) + } + Module._free(config.ptr); } @@ -66,11 +70,103 @@ function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { } } +function initSherpaOnnxOfflineTtsMatchaModelConfig(config, Module) { + const acousticModelLen = Module.lengthBytesUTF8(config.acousticModel) + 1; + const vocoderLen = Module.lengthBytesUTF8(config.vocoder) + 1; + const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1; + const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1; + const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1; + const dictDirLen = Module.lengthBytesUTF8(config.dictDir || '') + 1; + + const n = acousticModelLen + vocoderLen + lexiconLen + tokensLen + + dataDirLen + dictDirLen; + + const buffer = Module._malloc(n); + + const len = 8 * 4; + const ptr = Module._malloc(len); + + let offset = 0; + Module.stringToUTF8( + config.acousticModel || '', buffer + offset, acousticModelLen); + offset += acousticModelLen; + + Module.stringToUTF8(config.vocoder || '', buffer + offset, vocoderLen); + offset += vocoderLen; + + 
Module.stringToUTF8(config.lexicon || '', buffer + offset, lexiconLen); + offset += lexiconLen; + + Module.stringToUTF8(config.tokens || '', buffer + offset, tokensLen); + offset += tokensLen; + + Module.stringToUTF8(config.dataDir || '', buffer + offset, dataDirLen); + offset += dataDirLen; + + Module.stringToUTF8(config.dictDir || '', buffer + offset, dictDirLen); + offset += dictDirLen; + + offset = 0; + Module.setValue(ptr, buffer + offset, 'i8*'); + offset += acousticModelLen; + + Module.setValue(ptr + 4, buffer + offset, 'i8*'); + offset += vocoderLen; + + Module.setValue(ptr + 8, buffer + offset, 'i8*'); + offset += lexiconLen; + + Module.setValue(ptr + 12, buffer + offset, 'i8*'); + offset += tokensLen; + + Module.setValue(ptr + 16, buffer + offset, 'i8*'); + offset += dataDirLen; + + Module.setValue(ptr + 20, config.noiseScale || 0.667, 'float'); + Module.setValue(ptr + 24, config.lengthScale || 1.0, 'float'); + Module.setValue(ptr + 28, buffer + offset, 'i8*'); + offset += dictDirLen; + + return { + buffer: buffer, ptr: ptr, len: len, + } +} + function initSherpaOnnxOfflineTtsModelConfig(config, Module) { + if (!('offlineTtsVitsModelConfig' in config)) { + config.offlineTtsVitsModelConfig = { + model: '', + lexicon: '', + tokens: '', + noiseScale: 0.667, + noiseScaleW: 0.8, + lengthScale: 1.0, + dataDir: '', + dictDir: '', + }; + } + + if (!('offlineTtsMatchaModelConfig' in config)) { + config.offlineTtsMatchaModelConfig = { + acousticModel: '', + vocoder: '', + lexicon: '', + tokens: '', + noiseScale: 0.667, + lengthScale: 1.0, + dataDir: '', + dictDir: '', + }; + } + + const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig( config.offlineTtsVitsModelConfig, Module); - const len = vitsModelConfig.len + 3 * 4; + const matchaModelConfig = initSherpaOnnxOfflineTtsMatchaModelConfig( + config.offlineTtsMatchaModelConfig, Module); + + const len = vitsModelConfig.len + matchaModelConfig.len + 3 * 4; const ptr = Module._malloc(len); let offset = 0; @@ 
-87,9 +183,14 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) { const buffer = Module._malloc(providerLen); Module.stringToUTF8(config.provider, buffer, providerLen); Module.setValue(ptr + offset, buffer, 'i8*'); + offset += 4; + + Module._CopyHeap(matchaModelConfig.ptr, matchaModelConfig.len, ptr + offset); + offset += matchaModelConfig.len; return { buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig, + config2: matchaModelConfig } } @@ -195,12 +296,26 @@ function createOfflineTts(Module, myConfig) { noiseScaleW: 0.8, lengthScale: 1.0, }; + + const offlineTtsMatchaModelConfig = { + acousticModel: '', + vocoder: '', + lexicon: '', + tokens: '', + dataDir: '', + dictDir: '', + noiseScale: 0.667, + lengthScale: 1.0, + }; + const offlineTtsModelConfig = { offlineTtsVitsModelConfig: offlineTtsVitsModelConfig, + offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig, numThreads: 1, debug: 1, provider: 'cpu', }; + let offlineTtsConfig = { offlineTtsModelConfig: offlineTtsModelConfig, ruleFsts: '', diff --git a/wasm/tts/sherpa-onnx-wasm-main-tts.cc b/wasm/tts/sherpa-onnx-wasm-main-tts.cc index 872a1c853..3508b860d 100644 --- a/wasm/tts/sherpa-onnx-wasm-main-tts.cc +++ b/wasm/tts/sherpa-onnx-wasm-main-tts.cc @@ -14,8 +14,10 @@ extern "C" { static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, ""); +static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) == - sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + 3 * 4, + sizeof(SherpaOnnxOfflineTtsVitsModelConfig) + + sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) + 3 * 4, ""); static_assert(sizeof(SherpaOnnxOfflineTtsConfig) == sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4, @@ -24,6 +26,7 @@ static_assert(sizeof(SherpaOnnxOfflineTtsConfig) == void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { auto tts_model_config = &tts_config->model; auto vits_model_config = &tts_model_config->vits; + auto matcha_model_config = 
&tts_model_config->matcha; fprintf(stdout, "----------vits model config----------\n"); fprintf(stdout, "model: %s\n", vits_model_config->model); fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon); @@ -34,6 +37,16 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) { fprintf(stdout, "length scale: %.3f\n", vits_model_config->length_scale); fprintf(stdout, "dict_dir: %s\n", vits_model_config->dict_dir); + fprintf(stdout, "----------matcha model config----------\n"); + fprintf(stdout, "acoustic_model: %s\n", matcha_model_config->acoustic_model); + fprintf(stdout, "vocoder: %s\n", matcha_model_config->vocoder); + fprintf(stdout, "lexicon: %s\n", matcha_model_config->lexicon); + fprintf(stdout, "tokens: %s\n", matcha_model_config->tokens); + fprintf(stdout, "data_dir: %s\n", matcha_model_config->data_dir); + fprintf(stdout, "noise scale: %.3f\n", matcha_model_config->noise_scale); + fprintf(stdout, "length scale: %.3f\n", matcha_model_config->length_scale); + fprintf(stdout, "dict_dir: %s\n", matcha_model_config->dict_dir); + fprintf(stdout, "----------tts model config----------\n"); fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads); fprintf(stdout, "debug: %d\n", tts_model_config->debug);