From d7c95d33a351d8505335ff7735be68b37cb54f9f Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Mon, 6 Jan 2025 11:03:31 +0800 Subject: [PATCH] Add Dart API for MatchaTTS models (#1687) --- .github/scripts/test-dart.sh | 41 +++++---- .github/workflows/checksum.yaml | 1 + dart-api-examples/tts/bin/matcha-en.dart | 86 ++++++++++++++++++ dart-api-examples/tts/bin/matcha-zh.dart | 90 +++++++++++++++++++ .../tts/bin/{zh.dart => vits-zh.dart} | 0 dart-api-examples/tts/run-matcha-en.sh | 32 +++++++ dart-api-examples/tts/run-matcha-zh.sh | 45 ++++++++++ .../tts/{run-zh.sh => run-vits-zh.sh} | 8 +- .../lib/src/sherpa_onnx_bindings.dart | 17 ++++ flutter/sherpa_onnx/lib/src/tts.dart | 53 ++++++++++- 10 files changed, 349 insertions(+), 24 deletions(-) create mode 100644 dart-api-examples/tts/bin/matcha-en.dart create mode 100644 dart-api-examples/tts/bin/matcha-zh.dart rename dart-api-examples/tts/bin/{zh.dart => vits-zh.dart} (100%) create mode 100755 dart-api-examples/tts/run-matcha-en.sh create mode 100755 dart-api-examples/tts/run-matcha-zh.sh rename dart-api-examples/tts/{run-zh.sh => run-vits-zh.sh} (92%) diff --git a/.github/scripts/test-dart.sh b/.github/scripts/test-dart.sh index 881a4765b..6ba747652 100755 --- a/.github/scripts/test-dart.sh +++ b/.github/scripts/test-dart.sh @@ -4,6 +4,31 @@ set -ex cd dart-api-examples +pushd tts + +echo '----------matcha tts----------' +./run-matcha-zh.sh +./run-matcha-en.sh +ls -lh *.wav +rm -rf matcha-icefall-* +rm *.onnx + +echo '----------piper tts----------' +./run-piper.sh +rm -rf vits-piper-* + +echo '----------coqui tts----------' +./run-coqui.sh +rm -rf vits-coqui-* + +echo '----------zh tts----------' +./run-vits-zh.sh +rm -rf sherpa-onnx-* + +ls -lh *.wav + +popd # tts + pushd speaker-diarization echo '----------speaker diarization----------' ./run.sh @@ -106,22 +131,6 @@ rm -rf sherpa-onnx-* popd # non-streaming-asr -pushd tts - -echo '----------piper tts----------' -./run-piper.sh -rm -rf vits-piper-* - -echo '----------coqui tts----------' -./run-coqui.sh -rm -rf vits-coqui-* - -echo '----------zh tts----------' -./run-zh.sh -rm -rf sherpa-onnx-* - -popd # tts - pushd streaming-asr echo '----------streaming zipformer ctc HLG----------' diff --git a/.github/workflows/checksum.yaml b/.github/workflows/checksum.yaml index 07af3768c..e500209d6 100644 --- a/.github/workflows/checksum.yaml +++ b/.github/workflows/checksum.yaml @@ -7,6 +7,7 @@ on: jobs: checksum: + if: github.repository_owner == 'k2-fsa' runs-on: macos-latest strategy: matrix: diff --git a/dart-api-examples/tts/bin/matcha-en.dart b/dart-api-examples/tts/bin/matcha-en.dart new file mode 100644 index 000000000..fa4c07653 --- /dev/null +++ b/dart-api-examples/tts/bin/matcha-en.dart @@ -0,0 +1,86 @@ +// Copyright (c) 2025 Xiaomi Corporation +import 'dart:io'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('acoustic-model', help: 'Path to the acoustic model') + ..addOption('vocoder', help: 'Path to the vocoder model') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption( + 'data-dir', + help: 'Path to espeak-ng-data directory', + defaultsTo: '', + ) + ..addOption('rule-fsts', help: 'Path to rule fsts', defaultsTo: '') + ..addOption('rule-fars', help: 'Path to rule fars', defaultsTo: '') + ..addOption('text', help: 'Text to generate TTS for') + ..addOption('output-wav', help: 'Filename to save the generated audio') + ..addOption('speed', help: 'Speech speed', defaultsTo: '1.0') + ..addOption( + 'sid', + help: 'Speaker ID to select. Used only for multi-speaker TTS', + defaultsTo: '0', + ); + final res = parser.parse(arguments); + if (res['acoustic-model'] == null || + res['vocoder'] == null || + res['tokens'] == null || + res['data-dir'] == null || + res['output-wav'] == null || + res['text'] == null) { + print(parser.usage); + exit(1); + } + final acousticModel = res['acoustic-model'] as String; + final vocoder = res['vocoder'] as String; + final tokens = res['tokens'] as String; + final dataDir = res['data-dir'] as String; + final ruleFsts = res['rule-fsts'] as String; + final ruleFars = res['rule-fars'] as String; + final text = res['text'] as String; + final outputWav = res['output-wav'] as String; + var speed = double.tryParse(res['speed'] as String) ?? 1.0; + final sid = int.tryParse(res['sid'] as String) ?? 0; + + if (speed == 0) { + speed = 1.0; + } + + final matcha = sherpa_onnx.OfflineTtsMatchaModelConfig( + acousticModel: acousticModel, + vocoder: vocoder, + tokens: tokens, + dataDir: dataDir, + lengthScale: 1 / speed, + ); + + final modelConfig = sherpa_onnx.OfflineTtsModelConfig( + matcha: matcha, + numThreads: 1, + debug: true, + ); + final config = sherpa_onnx.OfflineTtsConfig( + model: modelConfig, + maxNumSenetences: 1, + ruleFsts: ruleFsts, + ruleFars: ruleFars, + ); + + final tts = sherpa_onnx.OfflineTts(config); + final audio = tts.generate(text: text, sid: sid, speed: speed); + tts.free(); + + sherpa_onnx.writeWave( + filename: outputWav, + samples: audio.samples, + sampleRate: audio.sampleRate, + ); + print('Saved to $outputWav'); +} diff --git a/dart-api-examples/tts/bin/matcha-zh.dart b/dart-api-examples/tts/bin/matcha-zh.dart new file mode 100644 index 000000000..d52175e74 --- /dev/null +++ b/dart-api-examples/tts/bin/matcha-zh.dart @@ -0,0 +1,90 @@ +// Copyright (c) 2025 Xiaomi Corporation +import 'dart:io'; + +import 'package:args/args.dart'; +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; + +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + final parser = ArgParser() + ..addOption('acoustic-model', help: 'Path to the acoustic model') + ..addOption('vocoder', help: 'Path to the vocoder model') + ..addOption('tokens', help: 'Path to tokens.txt') + ..addOption('lexicon', help: 'Path to lexicon.txt') + ..addOption( + 'dict-dir', + help: 'Path to jieba dict directory', + defaultsTo: '', + ) + ..addOption('rule-fsts', help: 'Path to rule fsts', defaultsTo: '') + ..addOption('rule-fars', help: 'Path to rule fars', defaultsTo: '') + ..addOption('text', help: 'Text to generate TTS for') + ..addOption('output-wav', help: 'Filename to save the generated audio') + ..addOption('speed', help: 'Speech speed', defaultsTo: '1.0') + ..addOption( + 'sid', + help: 'Speaker ID to select. Used only for multi-speaker TTS', + defaultsTo: '0', + ); + final res = parser.parse(arguments); + if (res['acoustic-model'] == null || + res['vocoder'] == null || + res['lexicon'] == null || + res['tokens'] == null || + res['dict-dir'] == null || + res['output-wav'] == null || + res['text'] == null) { + print(parser.usage); + exit(1); + } + final acousticModel = res['acoustic-model'] as String; + final vocoder = res['vocoder'] as String; + final lexicon = res['lexicon'] as String; + final tokens = res['tokens'] as String; + final dictDir = res['dict-dir'] as String; + final ruleFsts = res['rule-fsts'] as String; + final ruleFars = res['rule-fars'] as String; + final text = res['text'] as String; + final outputWav = res['output-wav'] as String; + var speed = double.tryParse(res['speed'] as String) ?? 1.0; + final sid = int.tryParse(res['sid'] as String) ?? 0; + + if (speed == 0) { + speed = 1.0; + } + + final matcha = sherpa_onnx.OfflineTtsMatchaModelConfig( + acousticModel: acousticModel, + vocoder: vocoder, + lexicon: lexicon, + tokens: tokens, + dictDir: dictDir, + lengthScale: 1 / speed, + ); + + final modelConfig = sherpa_onnx.OfflineTtsModelConfig( + matcha: matcha, + numThreads: 1, + debug: true, + ); + final config = sherpa_onnx.OfflineTtsConfig( + model: modelConfig, + maxNumSenetences: 1, + ruleFsts: ruleFsts, + ruleFars: ruleFars, + ); + + final tts = sherpa_onnx.OfflineTts(config); + final audio = tts.generate(text: text, sid: sid, speed: speed); + tts.free(); + + sherpa_onnx.writeWave( + filename: outputWav, + samples: audio.samples, + sampleRate: audio.sampleRate, + ); + print('Saved to $outputWav'); +} diff --git a/dart-api-examples/tts/bin/zh.dart b/dart-api-examples/tts/bin/vits-zh.dart similarity index 100% rename from dart-api-examples/tts/bin/zh.dart rename to dart-api-examples/tts/bin/vits-zh.dart diff --git a/dart-api-examples/tts/run-matcha-en.sh b/dart-api-examples/tts/run-matcha-en.sh new file mode 100755 index 000000000..f727ee5c8 --- /dev/null +++ b/dart-api-examples/tts/run-matcha-en.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +# please visit +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker +# matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker +# to download more models +if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 + tar xf matcha-icefall-en_US-ljspeech.tar.bz2 + rm matcha-icefall-en_US-ljspeech.tar.bz2 +fi + +if [ ! -f ./hifigan_v2.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx +fi + +dart run \ + ./bin/matcha-en.dart \ + --acoustic-model ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \ + --vocoder ./hifigan_v2.onnx \ + --tokens ./matcha-icefall-en_US-ljspeech/tokens.txt \ + --data-dir ./matcha-icefall-en_US-ljspeech/espeak-ng-data \ + --sid 0 \ + --speed 1.0 \ + --output-wav matcha-en-1.wav \ + --text "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." \ + +ls -lh *.wav diff --git a/dart-api-examples/tts/run-matcha-zh.sh b/dart-api-examples/tts/run-matcha-zh.sh new file mode 100755 index 000000000..be95a827a --- /dev/null +++ b/dart-api-examples/tts/run-matcha-zh.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +# please visit +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker +# to download more models +if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 + tar xvf matcha-icefall-zh-baker.tar.bz2 + rm matcha-icefall-zh-baker.tar.bz2 +fi + +if [ ! -f ./hifigan_v2.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx +fi + +dart run \ + ./bin/matcha-zh.dart \ + --acoustic-model ./matcha-icefall-zh-baker/model-steps-3.onnx \ + --vocoder ./hifigan_v2.onnx \ + --lexicon ./matcha-icefall-zh-baker/lexicon.txt \ + --tokens ./matcha-icefall-zh-baker/tokens.txt \ + --dict-dir ./matcha-icefall-zh-baker/dict \ + --rule-fsts ./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \ + --sid 0 \ + --speed 1.0 \ + --output-wav matcha-zh-1.wav \ + --text "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" \ + +dart run \ + ./bin/matcha-zh.dart \ + --acoustic-model ./matcha-icefall-zh-baker/model-steps-3.onnx \ + --vocoder ./hifigan_v2.onnx \ + --lexicon ./matcha-icefall-zh-baker/lexicon.txt \ + --tokens ./matcha-icefall-zh-baker/tokens.txt \ + --dict-dir ./matcha-icefall-zh-baker/dict \ + --sid 0 \ + --speed 1.0 \ + --output-wav matcha-zh-2.wav \ + --text "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔." \ + +ls -lh *.wav diff --git a/dart-api-examples/tts/run-zh.sh b/dart-api-examples/tts/run-vits-zh.sh similarity index 92% rename from dart-api-examples/tts/run-zh.sh rename to dart-api-examples/tts/run-vits-zh.sh index 057260b61..2298f9eb1 100755 --- a/dart-api-examples/tts/run-zh.sh +++ b/dart-api-examples/tts/run-vits-zh.sh @@ -16,7 +16,7 @@ if [[ ! -f ./sherpa-onnx-vits-zh-ll/tokens.txt ]]; then fi dart run \ - ./bin/zh.dart \ + ./bin/vits-zh.dart \ --model ./sherpa-onnx-vits-zh-ll/model.onnx \ --lexicon ./sherpa-onnx-vits-zh-ll/lexicon.txt \ --tokens ./sherpa-onnx-vits-zh-ll/tokens.txt \ @@ -24,10 +24,10 @@ dart run \ --sid 2 \ --speed 1.0 \ --text '当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。' \ - --output-wav zh-jieba-2.wav + --output-wav vits-zh-jieba-2.wav dart run \ - ./bin/zh.dart \ + ./bin/vits-zh.dart \ --model ./sherpa-onnx-vits-zh-ll/model.onnx \ --lexicon ./sherpa-onnx-vits-zh-ll/lexicon.txt \ --tokens ./sherpa-onnx-vits-zh-ll/tokens.txt \ @@ -36,6 +36,6 @@ dart run \ --sid 3 \ --speed 1.0 \ --text '今天是2024年6月15号,13点23分。如果有困难,请拨打110或者18920240511。123456块钱。' \ - --output-wav zh-jieba-3.wav + --output-wav vits-zh-jieba-3.wav ls -lh *.wav diff --git a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart index 0d463cf6d..7baf53f26 100644 --- a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart +++ b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart @@ -131,6 +131,22 @@ final class SherpaOnnxOfflineTtsVitsModelConfig extends Struct { external Pointer dictDir; } +final class SherpaOnnxOfflineTtsMatchaModelConfig extends Struct { + external Pointer acousticModel; + external Pointer vocoder; + external Pointer lexicon; + external Pointer tokens; + external Pointer dataDir; + + @Float() + external double noiseScale; + + @Float() + external double lengthScale; + + external Pointer dictDir; +} + final class SherpaOnnxOfflineTtsModelConfig extends Struct { external SherpaOnnxOfflineTtsVitsModelConfig vits; @Int32() @@ -140,6 +156,7 @@ final class SherpaOnnxOfflineTtsModelConfig extends Struct { external int debug; external Pointer provider; + external SherpaOnnxOfflineTtsMatchaModelConfig matcha; } final class SherpaOnnxOfflineTtsConfig extends Struct { diff --git a/flutter/sherpa_onnx/lib/src/tts.dart b/flutter/sherpa_onnx/lib/src/tts.dart index 2f550e273..b5dcda48d 100644 --- a/flutter/sherpa_onnx/lib/src/tts.dart +++ b/flutter/sherpa_onnx/lib/src/tts.dart @@ -8,9 +8,9 @@ import './sherpa_onnx_bindings.dart'; class OfflineTtsVitsModelConfig { const OfflineTtsVitsModelConfig({ - required this.model, + this.model = '', this.lexicon = '', - required this.tokens, + this.tokens = '', this.dataDir = '', this.noiseScale = 0.667, this.noiseScaleW = 0.8, @@ -33,9 +33,37 @@ class OfflineTtsVitsModelConfig { final String dictDir; } +class OfflineTtsMatchaModelConfig { + const OfflineTtsMatchaModelConfig({ + this.acousticModel = '', + this.vocoder = '', + this.lexicon = '', + this.tokens = '', + this.dataDir = '', + this.noiseScale = 0.667, + this.lengthScale = 1.0, + this.dictDir = '', + }); + + @override + String toString() { + return 'OfflineTtsMatchaModelConfig(acousticModel: $acousticModel, vocoder: $vocoder, lexicon: $lexicon, tokens: $tokens, dataDir: $dataDir, noiseScale: $noiseScale, lengthScale: $lengthScale, dictDir: $dictDir)'; + } + + final String acousticModel; + final String vocoder; + final String lexicon; + final String tokens; + final String dataDir; + final double noiseScale; + final double lengthScale; + final String dictDir; +} + class OfflineTtsModelConfig { const OfflineTtsModelConfig({ - required this.vits, + this.vits = const OfflineTtsVitsModelConfig(), + this.matcha = const OfflineTtsMatchaModelConfig(), this.numThreads = 1, this.debug = true, this.provider = 'cpu', @@ -43,10 +71,11 @@ class OfflineTtsModelConfig { @override String toString() { - return 'OfflineTtsModelConfig(vits: $vits, numThreads: $numThreads, debug: $debug, provider: $provider)'; + return 'OfflineTtsModelConfig(vits: $vits, matcha: $matcha, numThreads: $numThreads, debug: $debug, provider: $provider)'; } final OfflineTtsVitsModelConfig vits; + final OfflineTtsMatchaModelConfig matcha; final int numThreads; final bool debug; final String provider; @@ -99,6 +128,16 @@ class OfflineTts { c.ref.model.vits.lengthScale = config.model.vits.lengthScale; c.ref.model.vits.dictDir = config.model.vits.dictDir.toNativeUtf8(); + c.ref.model.matcha.acousticModel = + config.model.matcha.acousticModel.toNativeUtf8(); + c.ref.model.matcha.vocoder = config.model.matcha.vocoder.toNativeUtf8(); + c.ref.model.matcha.lexicon = config.model.matcha.lexicon.toNativeUtf8(); + c.ref.model.matcha.tokens = config.model.matcha.tokens.toNativeUtf8(); + c.ref.model.matcha.dataDir = config.model.matcha.dataDir.toNativeUtf8(); + c.ref.model.matcha.noiseScale = config.model.matcha.noiseScale; + c.ref.model.matcha.lengthScale = config.model.matcha.lengthScale; + c.ref.model.matcha.dictDir = config.model.matcha.dictDir.toNativeUtf8(); + c.ref.model.numThreads = config.model.numThreads; c.ref.model.debug = config.model.debug ? 1 : 0; c.ref.model.provider = config.model.provider.toNativeUtf8(); @@ -112,6 +151,12 @@ class OfflineTts { calloc.free(c.ref.ruleFars); calloc.free(c.ref.ruleFsts); calloc.free(c.ref.model.provider); + calloc.free(c.ref.model.matcha.dictDir); + calloc.free(c.ref.model.matcha.dataDir); + calloc.free(c.ref.model.matcha.tokens); + calloc.free(c.ref.model.matcha.lexicon); + calloc.free(c.ref.model.matcha.vocoder); + calloc.free(c.ref.model.matcha.acousticModel); calloc.free(c.ref.model.vits.dictDir); calloc.free(c.ref.model.vits.dataDir); calloc.free(c.ref.model.vits.tokens);