From f457baea42b3efedfb55a8e4cda1d9b1538512ac Mon Sep 17 00:00:00 2001
From: Fangjun Kuang
Date: Thu, 2 Jan 2025 13:46:43 +0800
Subject: [PATCH] Support Matcha-TTS models using espeak-ng (#1672)

---
 .github/scripts/test-offline-tts.sh           |  22 +++
 .github/scripts/test-python.sh                |  25 ++-
 python-api-examples/offline-tts-play.py       |  25 ++-
 python-api-examples/offline-tts.py            |  25 ++-
 sherpa-onnx/csrc/macros.h                     |  32 ++--
 sherpa-onnx/csrc/offline-tts-matcha-impl.h    |  41 ++++-
 .../csrc/offline-tts-matcha-model-metadata.h  |   2 +-
 sherpa-onnx/csrc/offline-tts-matcha-model.cc  |   2 +-
 sherpa-onnx/csrc/piper-phonemize-lexicon.cc   | 147 +++++++++++++++---
 sherpa-onnx/csrc/piper-phonemize-lexicon.h    |  24 ++-
 10 files changed, 288 insertions(+), 57 deletions(-)

diff --git a/.github/scripts/test-offline-tts.sh b/.github/scripts/test-offline-tts.sh
index 1aa0340a0..70fd2247e 100755
--- a/.github/scripts/test-offline-tts.sh
+++ b/.github/scripts/test-offline-tts.sh
@@ -18,6 +18,28 @@ which $EXE
 # test waves are saved in ./tts
 mkdir ./tts
 
+log "------------------------------------------------------------"
+log "matcha-icefall-en_US-ljspeech"
+log "------------------------------------------------------------"
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
+rm matcha-icefall-en_US-ljspeech.tar.bz2
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+$EXE \
+  --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
+  --matcha-vocoder=./hifigan_v2.onnx \
+  --matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
+  --matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
+  --num-threads=2 \
+  --output-filename=./tts/matcha-ljspeech-1.wav \
+  --debug=1 \
+  "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
+
+rm hifigan_v2.onnx
+rm -rf matcha-icefall-en_US-ljspeech
+
 log "------------------------------------------------------------"
 log "matcha-icefall-zh-baker"
 log "------------------------------------------------------------"
diff --git a/.github/scripts/test-python.sh b/.github/scripts/test-python.sh
index 8bfe2c16f..350d9c185 100755
--- a/.github/scripts/test-python.sh
+++ b/.github/scripts/test-python.sh
@@ -267,7 +267,27 @@ log "Offline TTS test"
 # test waves are saved in ./tts
 mkdir ./tts
 
-log "vits-ljs test"
+log "matcha-ljspeech-en test"
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
+rm matcha-icefall-en_US-ljspeech.tar.bz2
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+python3 ./python-api-examples/offline-tts.py \
+  --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
+  --matcha-vocoder=./hifigan_v2.onnx \
+  --matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
+  --matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
+  --output-filename=./tts/test-matcha-ljspeech-en.wav \
+  --num-threads=2 \
+  "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
+
+rm hifigan_v2.onnx
+rm -rf matcha-icefall-en_US-ljspeech
+
+log "matcha-baker-zh test"
 
 curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
 tar xvf matcha-icefall-zh-baker.tar.bz2
 rm matcha-icefall-zh-baker.tar.bz2
@@ -282,12 +302,13 @@ python3 ./python-api-examples/offline-tts.py \
   --matcha-tokens=./matcha-icefall-zh-baker/tokens.txt \
   --tts-rule-fsts=./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst \
   --matcha-dict-dir=./matcha-icefall-zh-baker/dict \
-  --output-filename=./tts/test-matcha.wav \
+  --output-filename=./tts/test-matcha-baker-zh.wav \
   "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
 
 rm -rf matcha-icefall-zh-baker
 rm hifigan_v2.onnx
 
+log "vits-ljs test"
 curl -LS -O https://huggingface.co/csukuangfj/vits-ljs/resolve/main/vits-ljs.onnx
 curl -LS -O https://huggingface.co/csukuangfj/vits-ljs/resolve/main/lexicon.txt
diff --git a/python-api-examples/offline-tts-play.py b/python-api-examples/offline-tts-play.py
index e8350ea47..09d03dae6 100755
--- a/python-api-examples/offline-tts-play.py
+++ b/python-api-examples/offline-tts-play.py
@@ -11,7 +11,7 @@
 
 Usage:
 
-Example (1/4)
+Example (1/5)
 
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
 tar xf vits-piper-en_US-amy-low.tar.bz2
@@ -23,7 +23,7 @@
   --output-filename=./generated.wav \
   "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
 
-Example (2/4)
+Example (2/5)
 
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-zh-aishell3.tar.bz2
 tar xvf vits-zh-aishell3.tar.bz2
@@ -37,7 +37,7 @@
   --output-filename=./liubei-21.wav \
   "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
 
-Example (3/4)
+Example (3/5)
 
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
 tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
@@ -53,7 +53,7 @@
   --output-filename=./test-2.wav \
   "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
 
-Example (4/4)
+Example (4/5)
 
 curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
 tar xvf matcha-icefall-zh-baker.tar.bz2
@@ -71,6 +71,23 @@
   --output-filename=./test-matcha.wav \
   "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
 
+Example (5/5)
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
+rm matcha-icefall-en_US-ljspeech.tar.bz2
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+python3 ./python-api-examples/offline-tts-play.py \
+  --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
+  --matcha-vocoder=./hifigan_v2.onnx \
+  --matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
+  --matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
+  --output-filename=./test-matcha-ljspeech-en.wav \
+  --num-threads=2 \
+  "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
+
 You can find more models at
 https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
diff --git a/python-api-examples/offline-tts.py b/python-api-examples/offline-tts.py
index aa1cce935..72bf77959 100755
--- a/python-api-examples/offline-tts.py
+++ b/python-api-examples/offline-tts.py
@@ -12,7 +12,7 @@
 
 Usage:
 
-Example (1/4)
+Example (1/5)
 
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-piper-en_US-amy-low.tar.bz2
 tar xf vits-piper-en_US-amy-low.tar.bz2
@@ -24,7 +24,7 @@
   --output-filename=./generated.wav \
   "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
 
-Example (2/4)
+Example (2/5)
 
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-icefall-zh-aishell3.tar.bz2
 tar xvf vits-icefall-zh-aishell3.tar.bz2
@@ -38,7 +38,7 @@
   --output-filename=./liubei-21.wav \
   "勿以恶小而为之,勿以善小而不为。惟贤惟德,能服于人。122334"
 
-Example (3/4)
+Example (3/5)
 
 wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/sherpa-onnx-vits-zh-ll.tar.bz2
 tar xvf sherpa-onnx-vits-zh-ll.tar.bz2
@@ -54,7 +54,7 @@
   --output-filename=./test-2.wav \
   "当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感受着生命的奇迹与温柔。2024年5月11号,拨打110或者18920240511。123456块钱。"
 
-Example (4/4)
+Example (4/5)
 
 curl -O -SL https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
 tar xvf matcha-icefall-zh-baker.tar.bz2
@@ -72,6 +72,23 @@
   --output-filename=./test-matcha.wav \
   "某某银行的副行长和一些行政领导表示,他们去过长江和长白山; 经济不断增长。2024年12月31号,拨打110或者18920240511。123456块钱。"
 
+Example (5/5)
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
+tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
+rm matcha-icefall-en_US-ljspeech.tar.bz2
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
+
+python3 ./python-api-examples/offline-tts.py \
+  --matcha-acoustic-model=./matcha-icefall-en_US-ljspeech/model-steps-3.onnx \
+  --matcha-vocoder=./hifigan_v2.onnx \
+  --matcha-tokens=./matcha-icefall-en_US-ljspeech/tokens.txt \
+  --matcha-data-dir=./matcha-icefall-en_US-ljspeech/espeak-ng-data \
+  --output-filename=./test-matcha-ljspeech-en.wav \
+  --num-threads=2 \
+  "Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar."
+
 You can find more models at
 https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models
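The new Example (5/5) can also be driven through the Python API directly instead of going through the offline-tts.py script. The snippet below is a minimal sketch of that path; the class and field names (OfflineTtsMatchaModelConfig, acoustic_model, vocoder, tokens, data_dir, and generate()) mirror the --matcha-* flags used above, but treat them as assumptions to be checked against the installed sherpa_onnx version.

#!/usr/bin/env python3
# Minimal sketch: English Matcha TTS via the sherpa_onnx Python API.
# Paths follow the matcha-icefall-en_US-ljspeech example above.
# The exact config field names are assumptions; verify against your version.
import sherpa_onnx
import soundfile as sf

config = sherpa_onnx.OfflineTtsConfig(
    model=sherpa_onnx.OfflineTtsModelConfig(
        matcha=sherpa_onnx.OfflineTtsMatchaModelConfig(
            acoustic_model="./matcha-icefall-en_US-ljspeech/model-steps-3.onnx",
            vocoder="./hifigan_v2.onnx",
            tokens="./matcha-icefall-en_US-ljspeech/tokens.txt",
            data_dir="./matcha-icefall-en_US-ljspeech/espeak-ng-data",
        ),
        num_threads=2,
    ),
)

tts = sherpa_onnx.OfflineTts(config)
audio = tts.generate(
    "Today as always, men fall into two groups: slaves and free men.",
    sid=0,
    speed=1.0,
)
# audio.samples holds float samples; audio.sample_rate is an int.
sf.write(
    "./test-matcha-ljspeech-en.wav",
    audio.samples,
    samplerate=audio.sample_rate,
    subtype="PCM_16",
)

This is essentially what offline-tts.py assembles from its command-line arguments before calling generate().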
diff --git a/sherpa-onnx/csrc/macros.h b/sherpa-onnx/csrc/macros.h
index ac11d6a79..2788292df 100644
--- a/sherpa-onnx/csrc/macros.h
+++ b/sherpa-onnx/csrc/macros.h
@@ -49,19 +49,21 @@
   } while (0)
 #endif
 
+#define SHERPA_ONNX_EXIT(code) exit(code)
+
 // Read an integer
 #define SHERPA_ONNX_READ_META_DATA(dst, src_key) \
   do { \
     auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \
     if (value.empty()) { \
       SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \
-      exit(-1); \
+      SHERPA_ONNX_EXIT(-1); \
     } \
 \
     dst = atoi(value.c_str()); \
     if (dst < 0) { \
       SHERPA_ONNX_LOGE("Invalid value %d for '%s'", dst, src_key); \
-      exit(-1); \
+      SHERPA_ONNX_EXIT(-1); \
     } \
   } while (0)
@@ -74,7 +76,7 @@
       dst = atoi(value.c_str()); \
       if (dst < 0) { \
         SHERPA_ONNX_LOGE("Invalid value %d for '%s'", dst, src_key); \
-        exit(-1); \
+        SHERPA_ONNX_EXIT(-1); \
       } \
     } \
   } while (0)
@@ -85,13 +87,13 @@
     auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \
     if (value.empty()) { \
       SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \
-      exit(-1); \
+      SHERPA_ONNX_EXIT(-1); \
     } \
 \
     bool ret = SplitStringToIntegers(value.c_str(), ",", true, &dst); \
     if (!ret) { \
       SHERPA_ONNX_LOGE("Invalid value '%s' for '%s'", value.c_str(), src_key); \
-      exit(-1); \
+      SHERPA_ONNX_EXIT(-1); \
     } \
   } while (0)
@@ -101,13 +103,13 @@
     auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \
     if (value.empty()) { \
       SHERPA_ONNX_LOGE("%s does not exist in the metadata", src_key); \
-      exit(-1); \
+      SHERPA_ONNX_EXIT(-1); \
     } \
 \
     bool ret = SplitStringToFloats(value.c_str(), ",", true, &dst); \
     if (!ret) { \
       SHERPA_ONNX_LOGE("Invalid value '%s' for '%s'", value.c_str(), src_key); \
-      exit(-1); \
+      SHERPA_ONNX_EXIT(-1); \
     } \
   } while (0)
@@ -117,14 +119,14 @@
     auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \
     if (value.empty()) { \
       SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \
-      exit(-1); \
+      SHERPA_ONNX_EXIT(-1); \
     } \
     SplitStringToVector(value.c_str(), ",", false, &dst); \
 \
     if (dst.empty()) { \
       SHERPA_ONNX_LOGE("Invalid value '%s' for '%s'. Empty vector!", \
                        value.c_str(), src_key); \
-      exit(-1); \
+      SHERPA_ONNX_EXIT(-1); \
     } \
   } while (0)
@@ -134,14 +136,14 @@
     auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \
     if (value.empty()) { \
       SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \
-      exit(-1); \
+      SHERPA_ONNX_EXIT(-1); \
     } \
     SplitStringToVector(value.c_str(), sep, false, &dst); \
 \
     if (dst.empty()) { \
       SHERPA_ONNX_LOGE("Invalid value '%s' for '%s'. Empty vector!", \
                        value.c_str(), src_key); \
-      exit(-1); \
+      SHERPA_ONNX_EXIT(-1); \
     } \
   } while (0)
@@ -151,13 +153,13 @@
     auto value = LookupCustomModelMetaData(meta_data, src_key, allocator); \
     if (value.empty()) { \
       SHERPA_ONNX_LOGE("'%s' does not exist in the metadata", src_key); \
-      exit(-1); \
+      SHERPA_ONNX_EXIT(-1); \
     } \
 \
     dst = std::move(value); \
     if (dst.empty()) { \
       SHERPA_ONNX_LOGE("Invalid value for '%s'\n", src_key); \
-      exit(-1); \
+      SHERPA_ONNX_EXIT(-1); \
     } \
   } while (0)
@@ -178,11 +180,9 @@
       dst = std::move(value); \
       if (dst.empty()) { \
         SHERPA_ONNX_LOGE("Invalid value for '%s'\n", src_key); \
-        exit(-1); \
+        SHERPA_ONNX_EXIT(-1); \
       } \
     } \
   } while (0)
 
-#define SHERPA_ONNX_EXIT(code) exit(code)
-
 #endif  // SHERPA_ONNX_CSRC_MACROS_H_
diff --git a/sherpa-onnx/csrc/offline-tts-matcha-impl.h b/sherpa-onnx/csrc/offline-tts-matcha-impl.h
index 62c29bb83..a4f47fadb 100644
--- a/sherpa-onnx/csrc/offline-tts-matcha-impl.h
+++ b/sherpa-onnx/csrc/offline-tts-matcha-impl.h
@@ -321,12 +321,45 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl {
 
  private:
   template <typename Manager>
-  void InitFrontend(Manager *mgr) {}
+  void InitFrontend(Manager *mgr) {
+    // for piper phonemizer
+    // we require that you copy espeak_ng_data
+    // from assets to disk
+    //
+    // for jieba
+    // we require that you copy tokens.txt, lexicon.txt and dict
+    // from assets to disk
+    const auto &meta_data = model_->GetMetaData();
+
+    if (meta_data.jieba && !meta_data.has_espeak) {
+      frontend_ = std::make_unique<JiebaLexicon>(
+          config_.model.matcha.lexicon, config_.model.matcha.tokens,
+          config_.model.matcha.dict_dir, config_.model.debug);
+    } else if (meta_data.has_espeak && !meta_data.jieba) {
+      frontend_ = std::make_unique<PiperPhonemizeLexicon>(
+          mgr, config_.model.matcha.tokens, config_.model.matcha.data_dir,
+          meta_data);
+    } else {
+      SHERPA_ONNX_LOGE("jieba + espeak-ng is not supported yet");
+      SHERPA_ONNX_EXIT(-1);
+    }
+  }
 
   void InitFrontend() {
-    frontend_ = std::make_unique<JiebaLexicon>(
-        config_.model.matcha.lexicon, config_.model.matcha.tokens,
-        config_.model.matcha.dict_dir, config_.model.debug);
+    const auto &meta_data = model_->GetMetaData();
+
+    if (meta_data.jieba && !meta_data.has_espeak) {
+      frontend_ = std::make_unique<JiebaLexicon>(
+          config_.model.matcha.lexicon, config_.model.matcha.tokens,
+          config_.model.matcha.dict_dir, config_.model.debug);
+    } else if (meta_data.has_espeak && !meta_data.jieba) {
+      frontend_ = std::make_unique<PiperPhonemizeLexicon>(
+          config_.model.matcha.tokens, config_.model.matcha.data_dir,
+          meta_data);
+    } else {
+      SHERPA_ONNX_LOGE("jieba + espeak-ng is not supported yet");
+      SHERPA_ONNX_EXIT(-1);
+    }
   }
 
   GeneratedAudio Process(const std::vector<std::vector<int64_t>> &tokens,
diff --git a/sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h b/sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h
index 3147985dd..c5cee9465 100644
--- a/sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h
+++ b/sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h
@@ -18,7 +18,7 @@ struct OfflineTtsMatchaModelMetaData {
   int32_t num_speakers = 0;
   int32_t version = 1;
   int32_t jieba = 0;
-  int32_t espeak = 0;
+  int32_t has_espeak = 0;
   int32_t use_eos_bos = 0;
   int32_t pad_id = 0;
 };
diff --git a/sherpa-onnx/csrc/offline-tts-matcha-model.cc b/sherpa-onnx/csrc/offline-tts-matcha-model.cc
index 066dbd21a..afea546d0 100644
--- a/sherpa-onnx/csrc/offline-tts-matcha-model.cc
+++ b/sherpa-onnx/csrc/offline-tts-matcha-model.cc
@@ -142,7 +142,7 @@ class OfflineTtsMatchaModel::Impl {
     SHERPA_ONNX_READ_META_DATA_WITH_DEFAULT(meta_data_.version, "version", 1);
     SHERPA_ONNX_READ_META_DATA(meta_data_.num_speakers, "n_speakers");
     SHERPA_ONNX_READ_META_DATA(meta_data_.jieba, "jieba");
-    SHERPA_ONNX_READ_META_DATA(meta_data_.espeak, "has_espeak");
+    SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak");
     SHERPA_ONNX_READ_META_DATA(meta_data_.use_eos_bos, "use_eos_bos");
     SHERPA_ONNX_READ_META_DATA(meta_data_.pad_id, "pad_id");
   }
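The reader above expects these keys ("n_speakers", "version", "jieba", "has_espeak", "use_eos_bos", "pad_id") to be present in the ONNX model's metadata_props; has_espeak=1 together with jieba=0 is what routes text through the new espeak-ng frontend in InitFrontend(). The released matcha-icefall-en_US-ljspeech model already ships with the right metadata; the sketch below only illustrates how an export script could attach such keys, and the "model_type" key as well as the exact key set used by the real export scripts are assumptions here.

# Illustrative only: attach the metadata keys that
# OfflineTtsMatchaModel::Impl reads (see the hunk above) to an exported
# acoustic model. Real export scripts may set additional keys.
import onnx

model = onnx.load("model-steps-3.onnx")

meta = {
    "model_type": "matcha-tts",  # assumed key, not shown in this patch
    "version": "1",
    "n_speakers": "1",
    "jieba": "0",       # 0: do not use the jieba lexicon frontend
    "has_espeak": "1",  # 1: use the piper-phonemize/espeak-ng frontend
    "use_eos_bos": "0",
    "pad_id": "0",
}
for key, value in meta.items():
    entry = model.metadata_props.add()
    entry.key = key
    entry.value = value

onnx.save(model, "model-steps-3.onnx")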
"n_speakers"); SHERPA_ONNX_READ_META_DATA(meta_data_.jieba, "jieba"); - SHERPA_ONNX_READ_META_DATA(meta_data_.espeak, "has_espeak"); + SHERPA_ONNX_READ_META_DATA(meta_data_.has_espeak, "has_espeak"); SHERPA_ONNX_READ_META_DATA(meta_data_.use_eos_bos, "use_eos_bos"); SHERPA_ONNX_READ_META_DATA(meta_data_.pad_id, "pad_id"); } diff --git a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc index 298274654..9bc93ce98 100644 --- a/sherpa-onnx/csrc/piper-phonemize-lexicon.cc +++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.cc @@ -32,6 +32,18 @@ namespace sherpa_onnx { +static void CallPhonemizeEspeak( + const std::string &text, + piper::eSpeakPhonemeConfig &config, // NOLINT + std::vector> *phonemes) { + static std::mutex espeak_mutex; + + std::lock_guard lock(espeak_mutex); + + // keep multi threads from calling into piper::phonemize_eSpeak + piper::phonemize_eSpeak(text, config, *phonemes); +} + static std::unordered_map ReadTokens(std::istream &is) { std::wstring_convert, char32_t> conv; std::unordered_map token2id; @@ -87,7 +99,7 @@ static std::unordered_map ReadTokens(std::istream &is) { // see the function "phonemes_to_ids" from // https://github.com/rhasspy/piper/blob/master/notebooks/piper_inference_(ONNX).ipynb -static std::vector PiperPhonemesToIds( +static std::vector PiperPhonemesToIdsVits( const std::unordered_map &token2id, const std::vector &phonemes) { // see @@ -114,17 +126,46 @@ static std::vector PiperPhonemesToIds( return ans; } +static std::vector PiperPhonemesToIdsMatcha( + const std::unordered_map &token2id, + const std::vector &phonemes, bool use_eos_bos) { + std::vector ans; + ans.reserve(phonemes.size()); + + int32_t bos = token2id.at(U'^'); + int32_t eos = token2id.at(U'$'); + + if (use_eos_bos) { + ans.push_back(bos); + } + + for (auto p : phonemes) { + if (token2id.count(p)) { + ans.push_back(token2id.at(p)); + } else { + SHERPA_ONNX_LOGE("Skip unknown phonemes. 
+                       static_cast<uint32_t>(p));
+    }
+  }
+
+  if (use_eos_bos) {
+    ans.push_back(eos);
+  }
+
+  return ans;
+}
+
 static std::vector<int64_t> CoquiPhonemesToIds(
     const std::unordered_map<char32_t, int32_t> &token2id,
     const std::vector<piper::Phoneme> &phonemes,
-    const OfflineTtsVitsModelMetaData &meta_data) {
+    const OfflineTtsVitsModelMetaData &vits_meta_data) {
   // see
   // https://github.com/coqui-ai/TTS/blob/dev/TTS/tts/utils/text/tokenizer.py#L87
-  int32_t use_eos_bos = meta_data.use_eos_bos;
-  int32_t bos_id = meta_data.bos_id;
-  int32_t eos_id = meta_data.eos_id;
-  int32_t blank_id = meta_data.blank_id;
-  int32_t add_blank = meta_data.add_blank;
+  int32_t use_eos_bos = vits_meta_data.use_eos_bos;
+  int32_t bos_id = vits_meta_data.bos_id;
+  int32_t eos_id = vits_meta_data.eos_id;
+  int32_t blank_id = vits_meta_data.blank_id;
+  int32_t add_blank = vits_meta_data.add_blank;
   int32_t comma_id = token2id.at(',');
 
   std::vector<int64_t> ans;
@@ -189,8 +230,37 @@ static void InitEspeak(const std::string &data_dir) {
 
 PiperPhonemizeLexicon::PiperPhonemizeLexicon(
     const std::string &tokens, const std::string &data_dir,
-    const OfflineTtsVitsModelMetaData &meta_data)
-    : meta_data_(meta_data) {
+    const OfflineTtsVitsModelMetaData &vits_meta_data)
+    : vits_meta_data_(vits_meta_data) {
+  {
+    std::ifstream is(tokens);
+    token2id_ = ReadTokens(is);
+  }
+
+  InitEspeak(data_dir);
+}
+
+template <typename Manager>
+PiperPhonemizeLexicon::PiperPhonemizeLexicon(
+    Manager *mgr, const std::string &tokens, const std::string &data_dir,
+    const OfflineTtsVitsModelMetaData &vits_meta_data)
+    : vits_meta_data_(vits_meta_data) {
+  {
+    auto buf = ReadFile(mgr, tokens);
+    std::istrstream is(buf.data(), buf.size());
+    token2id_ = ReadTokens(is);
+  }
+
+  // We should copy the directory of espeak-ng-data from the asset to
+  // some internal or external storage and then pass the directory to
+  // data_dir.
+  InitEspeak(data_dir);
+}
+
+PiperPhonemizeLexicon::PiperPhonemizeLexicon(
+    const std::string &tokens, const std::string &data_dir,
+    const OfflineTtsMatchaModelMetaData &matcha_meta_data)
+    : matcha_meta_data_(matcha_meta_data), is_matcha_(true) {
   {
     std::ifstream is(tokens);
     token2id_ = ReadTokens(is);
@@ -202,8 +272,8 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon(
 template <typename Manager>
 PiperPhonemizeLexicon::PiperPhonemizeLexicon(
     Manager *mgr, const std::string &tokens, const std::string &data_dir,
-    const OfflineTtsVitsModelMetaData &meta_data)
-    : meta_data_(meta_data) {
+    const OfflineTtsMatchaModelMetaData &matcha_meta_data)
+    : matcha_meta_data_(matcha_meta_data), is_matcha_(true) {
   {
     auto buf = ReadFile(mgr, tokens);
     std::istrstream is(buf.data(), buf.size());
@@ -218,6 +288,15 @@ PiperPhonemizeLexicon::PiperPhonemizeLexicon(
 
 std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds(
     const std::string &text, const std::string &voice /*= ""*/) const {
+  if (is_matcha_) {
+    return ConvertTextToTokenIdsMatcha(text, voice);
+  } else {
+    return ConvertTextToTokenIdsVits(text, voice);
+  }
+}
+
+std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsMatcha(
+    const std::string &text, const std::string &voice /*= ""*/) const {
   piper::eSpeakPhonemeConfig config;
 
   // ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices
@@ -226,26 +305,45 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds(
   std::vector<std::vector<piper::Phoneme>> phonemes;
 
-  static std::mutex espeak_mutex;
-  {
-    std::lock_guard<std::mutex> lock(espeak_mutex);
+  CallPhonemizeEspeak(text, config, &phonemes);
 
-    // keep multi threads from calling into piper::phonemize_eSpeak
-    piper::phonemize_eSpeak(text, config, phonemes);
+  std::vector<TokenIDs> ans;
+
+  std::vector<int64_t> phoneme_ids;
+
+  for (const auto &p : phonemes) {
+    phoneme_ids =
+        PiperPhonemesToIdsMatcha(token2id_, p, matcha_meta_data_.use_eos_bos);
+    ans.emplace_back(std::move(phoneme_ids));
   }
 
+  return ans;
+}
+
+std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIdsVits(
+    const std::string &text, const std::string &voice /*= ""*/) const {
+  piper::eSpeakPhonemeConfig config;
+
+  // ./bin/espeak-ng-bin --path ./install/share/espeak-ng-data/ --voices
+  // to list available voices
+  config.voice = voice;  // e.g., voice is en-us
+
+  std::vector<std::vector<piper::Phoneme>> phonemes;
+
+  CallPhonemizeEspeak(text, config, &phonemes);
+
   std::vector<TokenIDs> ans;
 
   std::vector<int64_t> phoneme_ids;
-  if (meta_data_.is_piper || meta_data_.is_icefall) {
+  if (vits_meta_data_.is_piper || vits_meta_data_.is_icefall) {
     for (const auto &p : phonemes) {
-      phoneme_ids = PiperPhonemesToIds(token2id_, p);
+      phoneme_ids = PiperPhonemesToIdsVits(token2id_, p);
       ans.emplace_back(std::move(phoneme_ids));
     }
-  } else if (meta_data_.is_coqui) {
+  } else if (vits_meta_data_.is_coqui) {
     for (const auto &p : phonemes) {
-      phoneme_ids = CoquiPhonemesToIds(token2id_, p, meta_data_);
+      phoneme_ids = CoquiPhonemesToIds(token2id_, p, vits_meta_data_);
       ans.emplace_back(std::move(phoneme_ids));
     }
@@ -260,13 +358,18 @@ std::vector<TokenIDs> PiperPhonemizeLexicon::ConvertTextToTokenIds(
 #if __ANDROID_API__ >= 9
 template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
     AAssetManager *mgr, const std::string &tokens, const std::string &data_dir,
-    const OfflineTtsVitsModelMetaData &meta_data);
+    const OfflineTtsVitsModelMetaData &vits_meta_data);
+
+template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
+    AAssetManager *mgr, const std::string &tokens, const std::string &data_dir,
+    const OfflineTtsMatchaModelMetaData &matcha_meta_data);
 #endif
 
 #if __OHOS__
 template PiperPhonemizeLexicon::PiperPhonemizeLexicon(
     NativeResourceManager *mgr, const std::string &tokens,
-    const std::string &data_dir, const OfflineTtsVitsModelMetaData &meta_data);
+    const std::string &data_dir,
+    const OfflineTtsMatchaModelMetaData &matcha_meta_data);
 #endif
 
 }  // namespace sherpa_onnx
diff --git a/sherpa-onnx/csrc/piper-phonemize-lexicon.h b/sherpa-onnx/csrc/piper-phonemize-lexicon.h
index ccd790a96..f703f0b87 100644
--- a/sherpa-onnx/csrc/piper-phonemize-lexicon.h
+++ b/sherpa-onnx/csrc/piper-phonemize-lexicon.h
@@ -10,6 +10,7 @@
 #include 
 
 #include "sherpa-onnx/csrc/offline-tts-frontend.h"
+#include "sherpa-onnx/csrc/offline-tts-matcha-model-metadata.h"
 #include "sherpa-onnx/csrc/offline-tts-vits-model-metadata.h"
 
 namespace sherpa_onnx {
@@ -17,20 +18,37 @@ namespace sherpa_onnx {
 class PiperPhonemizeLexicon : public OfflineTtsFrontend {
  public:
   PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir,
-                        const OfflineTtsVitsModelMetaData &meta_data);
+                        const OfflineTtsVitsModelMetaData &vits_meta_data);
+
+  PiperPhonemizeLexicon(const std::string &tokens, const std::string &data_dir,
+                        const OfflineTtsMatchaModelMetaData &matcha_meta_data);
 
   template <typename Manager>
   PiperPhonemizeLexicon(Manager *mgr, const std::string &tokens,
                         const std::string &data_dir,
-                        const OfflineTtsVitsModelMetaData &meta_data);
+                        const OfflineTtsVitsModelMetaData &vits_meta_data);
+
+  template <typename Manager>
+  PiperPhonemizeLexicon(Manager *mgr, const std::string &tokens,
+                        const std::string &data_dir,
+                        const OfflineTtsMatchaModelMetaData &matcha_meta_data);
 
   std::vector<TokenIDs> ConvertTextToTokenIds(
       const std::string &text, const std::string &voice = "") const override;
 
+ private:
+  std::vector<TokenIDs> ConvertTextToTokenIdsVits(
+      const std::string &text, const std::string &voice = "") const;
+
+  std::vector<TokenIDs> ConvertTextToTokenIdsMatcha(
+      const std::string &text, const std::string &voice = "") const;
+
  private:
   // map unicode codepoint to an integer ID
   std::unordered_map<char32_t, int32_t> token2id_;
-  OfflineTtsVitsModelMetaData meta_data_;
+  OfflineTtsVitsModelMetaData vits_meta_data_;
+  OfflineTtsMatchaModelMetaData matcha_meta_data_;
+  bool is_matcha_ = false;
 };
 
 }  // namespace sherpa_onnx
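For reference, the phoneme-to-ID mapping introduced in PiperPhonemesToIdsMatcha corresponds roughly to the following Python sketch, where token2id is built from tokens.txt: a bos '^' and an eos '$' are added only when use_eos_bos is set, and phonemes that are missing from tokens.txt are skipped with a warning. This is an illustration of the logic, not part of the bindings' API.

# Python sketch of PiperPhonemesToIdsMatcha (for illustration only).
from typing import Dict, List


def matcha_phonemes_to_ids(token2id: Dict[str, int],
                           phonemes: List[str],
                           use_eos_bos: bool) -> List[int]:
    ids: List[int] = []
    if use_eos_bos:
        ids.append(token2id["^"])  # bos
    for p in phonemes:
        if p in token2id:
            ids.append(token2id[p])
        else:
            print(f"Skip unknown phoneme. Unicode codepoint: U+{ord(p):04X}")
    if use_eos_bos:
        ids.append(token2id["$"])  # eos
    return ids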