diff --git a/.github/workflows/test-go-package.yaml b/.github/workflows/test-go-package.yaml index 649735699..95c868b8f 100644 --- a/.github/workflows/test-go-package.yaml +++ b/.github/workflows/test-go-package.yaml @@ -209,6 +209,15 @@ jobs: go build ls -lh + echo "Test matcha zh" + ./run-matcha-zh.sh + rm -rf matcha-icefall-* + + echo "Test matcha en" + ./run-matcha-en.sh + rm -rf matcha-icefall-* + ls -lh *.wav + echo "Test vits-ljs" ./run-vits-ljs.sh rm -rf vits-ljs @@ -246,6 +255,15 @@ jobs: cp -v /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/sherpa-onnx-go-windows*/lib/x86_64-pc-windows-gnu/*.dll . ls -lh + echo "Test matcha zh" + ./run-matcha-zh.sh + rm -rf matcha-icefall-* + + echo "Test matcha en" + ./run-matcha-en.sh + rm -rf matcha-icefall-* + ls -lh *.wav + echo "Test vits-ljs" ./run-vits-ljs.sh rm -rf vits-ljs @@ -291,6 +309,15 @@ jobs: cp -v /C/Users/runneradmin/go/pkg/mod/github.com/k2-fsa/sherpa-onnx-go-windows*/lib/i686-pc-windows-gnu/*.dll . ls -lh + echo "Test matcha zh" + ./run-matcha-zh.sh + rm -rf matcha-icefall-* + + echo "Test matcha en" + ./run-matcha-en.sh + rm -rf matcha-icefall-* + ls -lh *.wav + echo "Test vits-ljs" ./run-vits-ljs.sh rm -rf vits-ljs diff --git a/.github/workflows/test-go.yaml b/.github/workflows/test-go.yaml index eaf561818..440402939 100644 --- a/.github/workflows/test-go.yaml +++ b/.github/workflows/test-go.yaml @@ -226,6 +226,15 @@ jobs: go build ls -lh + echo "Test matcha zh" + ./run-matcha-zh.sh + rm -rf matcha-icefall-* + + echo "Test matcha en" + ./run-matcha-en.sh + rm -rf matcha-icefall-* + ls -lh *.wav + echo "Test vits-ljs" ./run-vits-ljs.sh rm -rf vits-ljs diff --git a/go-api-examples/non-streaming-tts/main.go b/go-api-examples/non-streaming-tts/main.go index 0ddeb8fe4..73638d8f4 100644 --- a/go-api-examples/non-streaming-tts/main.go +++ b/go-api-examples/non-streaming-tts/main.go @@ -17,11 +17,22 @@ func main() { flag.StringVar(&config.Model.Vits.Lexicon, "vits-lexicon", "", "Path to lexicon.txt") 
flag.StringVar(&config.Model.Vits.Tokens, "vits-tokens", "", "Path to tokens.txt") flag.StringVar(&config.Model.Vits.DataDir, "vits-data-dir", "", "Path to espeak-ng-data") + flag.StringVar(&config.Model.Vits.DictDir, "vits-dict-dir", "", "Path to dict for jieba") flag.Float32Var(&config.Model.Vits.NoiseScale, "vits-noise-scale", 0.667, "noise_scale for VITS") flag.Float32Var(&config.Model.Vits.NoiseScaleW, "vits-noise-scale-w", 0.8, "noise_scale_w for VITS") flag.Float32Var(&config.Model.Vits.LengthScale, "vits-length-scale", 1.0, "length_scale for VITS. small -> faster in speech speed; large -> slower") + flag.StringVar(&config.Model.Matcha.AcousticModel, "matcha-acoustic-model", "", "Path to the matcha acoustic model") + flag.StringVar(&config.Model.Matcha.Vocoder, "matcha-vocoder", "", "Path to the matcha vocoder model") + flag.StringVar(&config.Model.Matcha.Lexicon, "matcha-lexicon", "", "Path to lexicon.txt") + flag.StringVar(&config.Model.Matcha.Tokens, "matcha-tokens", "", "Path to tokens.txt") + flag.StringVar(&config.Model.Matcha.DataDir, "matcha-data-dir", "", "Path to espeak-ng-data") + flag.StringVar(&config.Model.Matcha.DictDir, "matcha-dict-dir", "", "Path to dict for jieba") + + flag.Float32Var(&config.Model.Matcha.NoiseScale, "matcha-noise-scale", 0.667, "noise_scale for Matcha") + flag.Float32Var(&config.Model.Matcha.LengthScale, "matcha-length-scale", 1.0, "length_scale for Matcha. 
small -> faster in speech speed; large -> slower") + flag.IntVar(&config.Model.NumThreads, "num-threads", 1, "Number of threads for computing") flag.IntVar(&config.Model.Debug, "debug", 0, "Whether to show debug message") flag.StringVar(&config.Model.Provider, "provider", "cpu", "Provider to use") diff --git a/go-api-examples/non-streaming-tts/run-matcha-en.sh b/go-api-examples/non-streaming-tts/run-matcha-en.sh new file mode 100755 index 000000000..f0932da56 --- /dev/null +++ b/go-api-examples/non-streaming-tts/run-matcha-en.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +set -ex + +# please visit +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker +# matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker +# to download more models +if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2 + tar xf matcha-icefall-en_US-ljspeech.tar.bz2 + rm matcha-icefall-en_US-ljspeech.tar.bz2 +fi + +if [ ! -f ./hifigan_v2.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx +fi + +go mod tidy +go build + +./non-streaming-tts \ + --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \ + --matcha-vocoder=./hifigan_v2.onnx \ + --matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \ + --matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \ + --debug=1 \ + --output-filename=./test-matcha-en.wav \ + "Friends fell out often because life was changing so fast. The easiest thing in the world was to lose touch with someone." 
+ + diff --git a/go-api-examples/non-streaming-tts/run-matcha-zh.sh b/go-api-examples/non-streaming-tts/run-matcha-zh.sh new file mode 100755 index 000000000..ef4165d04 --- /dev/null +++ b/go-api-examples/non-streaming-tts/run-matcha-zh.sh @@ -0,0 +1,31 @@ +#!/usr/bin/env bash + +set -ex + +# please visit +# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker +# to download more models +if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2 + tar xf matcha-icefall-zh-baker.tar.bz2 + rm matcha-icefall-zh-baker.tar.bz2 +fi + +if [ ! -f ./hifigan_v2.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx +fi + +go mod tidy +go build + +./non-streaming-tts \ + --matcha-acoustic-model=./matcha-icefall-zh-baker/model-steps-3.onnx \ + --matcha-vocoder=./hifigan_v2.onnx \ + --matcha-lexicon=./matcha-icefall-zh-baker/lexicon.txt \ + --matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \ + --matcha-dict-dir=./matcha-icefall-zh-baker/dict \ + --debug=1 \ + --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \ + --output-filename=./test-matcha-zh.wav \ + "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。" + diff --git a/go-api-examples/non-streaming-tts/run-vits-piper-en_US-lessac-medium.sh b/go-api-examples/non-streaming-tts/run-vits-piper-en_US-lessac-medium.sh index 15e4f1dbd..6f8c98e80 100755 --- a/go-api-examples/non-streaming-tts/run-vits-piper-en_US-lessac-medium.sh +++ b/go-api-examples/non-streaming-tts/run-vits-piper-en_US-lessac-medium.sh @@ -4,7 +4,7 @@ set -ex if [ ! 
-d vits-piper-en_US-lessac-medium ]; then curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-lessac-medium.tar.bz2 - tar xvf vits-piper-en_US-lessac-medium.tar.bz2 + tar xf vits-piper-en_US-lessac-medium.tar.bz2 rm vits-piper-en_US-lessac-medium.tar.bz2 fi diff --git a/scripts/go/_internal/non-streaming-tts/run-matcha-en.sh b/scripts/go/_internal/non-streaming-tts/run-matcha-en.sh new file mode 120000 index 000000000..013e5bacc --- /dev/null +++ b/scripts/go/_internal/non-streaming-tts/run-matcha-en.sh @@ -0,0 +1 @@ +../../../../go-api-examples/non-streaming-tts/run-matcha-en.sh \ No newline at end of file diff --git a/scripts/go/_internal/non-streaming-tts/run-matcha-zh.sh b/scripts/go/_internal/non-streaming-tts/run-matcha-zh.sh new file mode 120000 index 000000000..73ef170ee --- /dev/null +++ b/scripts/go/_internal/non-streaming-tts/run-matcha-zh.sh @@ -0,0 +1 @@ +../../../../go-api-examples/non-streaming-tts/run-matcha-zh.sh \ No newline at end of file diff --git a/scripts/go/sherpa_onnx.go b/scripts/go/sherpa_onnx.go index 17afd32f2..763752840 100644 --- a/scripts/go/sherpa_onnx.go +++ b/scripts/go/sherpa_onnx.go @@ -671,8 +671,20 @@ type OfflineTtsVitsModelConfig struct { DictDir string // Path to dict directory for jieba (used only in Chinese tts) } +type OfflineTtsMatchaModelConfig struct { + AcousticModel string // Path to the acoustic model for MatchaTTS + Vocoder string // Path to the vocoder model for MatchaTTS + Lexicon string // Path to lexicon.txt + Tokens string // Path to tokens.txt + DataDir string // Path to espeak-ng-data directory + NoiseScale float32 // noise scale for matcha models. Please use 0.667 in general + LengthScale float32 // Please use 1.0 in general. Smaller -> Faster speech speed. 
Larger -> Slower speech speed + DictDir string // Path to dict directory for jieba (used only in Chinese tts) +} + type OfflineTtsModelConfig struct { - Vits OfflineTtsVitsModelConfig + Vits OfflineTtsVitsModelConfig + Matcha OfflineTtsMatchaModelConfig // Number of threads to use for neural network computation NumThreads int @@ -722,6 +734,7 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts { c.max_num_sentences = C.int(config.MaxNumSentences) + // vits c.model.vits.model = C.CString(config.Model.Vits.Model) defer C.free(unsafe.Pointer(c.model.vits.model)) @@ -741,6 +754,28 @@ func NewOfflineTts(config *OfflineTtsConfig) *OfflineTts { c.model.vits.dict_dir = C.CString(config.Model.Vits.DictDir) defer C.free(unsafe.Pointer(c.model.vits.dict_dir)) + // matcha + c.model.matcha.acoustic_model = C.CString(config.Model.Matcha.AcousticModel) + defer C.free(unsafe.Pointer(c.model.matcha.acoustic_model)) + + c.model.matcha.vocoder = C.CString(config.Model.Matcha.Vocoder) + defer C.free(unsafe.Pointer(c.model.matcha.vocoder)) + + c.model.matcha.lexicon = C.CString(config.Model.Matcha.Lexicon) + defer C.free(unsafe.Pointer(c.model.matcha.lexicon)) + + c.model.matcha.tokens = C.CString(config.Model.Matcha.Tokens) + defer C.free(unsafe.Pointer(c.model.matcha.tokens)) + + c.model.matcha.data_dir = C.CString(config.Model.Matcha.DataDir) + defer C.free(unsafe.Pointer(c.model.matcha.data_dir)) + + c.model.matcha.noise_scale = C.float(config.Model.Matcha.NoiseScale) + c.model.matcha.length_scale = C.float(config.Model.Matcha.LengthScale) + + c.model.matcha.dict_dir = C.CString(config.Model.Matcha.DictDir) + defer C.free(unsafe.Pointer(c.model.matcha.dict_dir)) + c.model.num_threads = C.int(config.Model.NumThreads) c.model.debug = C.int(config.Model.Debug)