-
Notifications
You must be signed in to change notification settings - Fork 477
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Java API for Matcha-TTS models. (#1673)
- Loading branch information
1 parent
f457bae
commit a00d3b4
Showing
11 changed files
with
359 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
// Copyright 2025 Xiaomi Corporation | ||
|
||
// This file shows how to use a matcha English model | ||
// to convert text to speech | ||
import com.k2fsa.sherpa.onnx.*; | ||
|
||
public class NonStreamingTtsMatchaEn { | ||
public static void main(String[] args) { | ||
// please visit | ||
// https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker | ||
// to download model files | ||
String acousticModel = "./matcha-icefall-en_US-ljspeech/model-steps-3.onnx"; | ||
String vocoder = "./hifigan_v2.onnx"; | ||
String tokens = "./matcha-icefall-en_US-ljspeech/tokens.txt"; | ||
String dataDir = "./matcha-icefall-en_US-ljspeech/espeak-ng-data"; | ||
String text = | ||
"Today as always, men fall into two groups: slaves and free men. Whoever does not have" | ||
+ " two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a" | ||
+ " businessman, an official, or a scholar."; | ||
|
||
OfflineTtsMatchaModelConfig matchaModelConfig = | ||
OfflineTtsMatchaModelConfig.builder() | ||
.setAcousticModel(acousticModel) | ||
.setVocoder(vocoder) | ||
.setTokens(tokens) | ||
.setDataDir(dataDir) | ||
.build(); | ||
|
||
OfflineTtsModelConfig modelConfig = | ||
OfflineTtsModelConfig.builder() | ||
.setMatcha(matchaModelConfig) | ||
.setNumThreads(1) | ||
.setDebug(true) | ||
.build(); | ||
|
||
OfflineTtsConfig config = OfflineTtsConfig.builder().setModel(modelConfig).build(); | ||
OfflineTts tts = new OfflineTts(config); | ||
|
||
int sid = 0; | ||
float speed = 1.0f; | ||
long start = System.currentTimeMillis(); | ||
GeneratedAudio audio = tts.generate(text, sid, speed); | ||
long stop = System.currentTimeMillis(); | ||
|
||
float timeElapsedSeconds = (stop - start) / 1000.0f; | ||
|
||
float audioDuration = audio.getSamples().length / (float) audio.getSampleRate(); | ||
float real_time_factor = timeElapsedSeconds / audioDuration; | ||
|
||
String waveFilename = "tts-matcha-en.wav"; | ||
audio.save(waveFilename); | ||
System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); | ||
System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds); | ||
System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); | ||
System.out.printf("-- text: %s\n", text); | ||
System.out.printf("-- Saved to %s\n", waveFilename); | ||
|
||
tts.release(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
// Copyright 2025 Xiaomi Corporation | ||
|
||
// This file shows how to use a matcha Chinese TTS model | ||
// to convert text to speech | ||
import com.k2fsa.sherpa.onnx.*; | ||
|
||
public class NonStreamingTtsMatchaZh { | ||
public static void main(String[] args) { | ||
// please visit | ||
// https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker | ||
// to download model files | ||
String acousticModel = "./matcha-icefall-zh-baker/model-steps-3.onnx"; | ||
String vocoder = "./hifigan_v2.onnx"; | ||
String tokens = "./matcha-icefall-zh-baker/tokens.txt"; | ||
String lexicon = "./matcha-icefall-zh-baker/lexicon.txt"; | ||
String dictDir = "./matcha-icefall-zh-baker/dict"; | ||
String ruleFsts = | ||
"./matcha-icefall-zh-baker/phone.fst,./matcha-icefall-zh-baker/date.fst,./matcha-icefall-zh-baker/number.fst"; | ||
String text = | ||
"某某银行的副行长和一些行政领导表示,他们去过长江" | ||
+ "和长白山; 经济不断增长。" | ||
+ "2024年12月31号,拨打110或者18920240511。" | ||
+ "123456块钱。"; | ||
|
||
OfflineTtsMatchaModelConfig matchaModelConfig = | ||
OfflineTtsMatchaModelConfig.builder() | ||
.setAcousticModel(acousticModel) | ||
.setVocoder(vocoder) | ||
.setTokens(tokens) | ||
.setLexicon(lexicon) | ||
.setDictDir(dictDir) | ||
.build(); | ||
|
||
OfflineTtsModelConfig modelConfig = | ||
OfflineTtsModelConfig.builder() | ||
.setMatcha(matchaModelConfig) | ||
.setNumThreads(1) | ||
.setDebug(true) | ||
.build(); | ||
|
||
OfflineTtsConfig config = | ||
OfflineTtsConfig.builder().setModel(modelConfig).setRuleFsts(ruleFsts).build(); | ||
OfflineTts tts = new OfflineTts(config); | ||
|
||
int sid = 0; | ||
float speed = 1.0f; | ||
long start = System.currentTimeMillis(); | ||
GeneratedAudio audio = tts.generate(text, sid, speed); | ||
long stop = System.currentTimeMillis(); | ||
|
||
float timeElapsedSeconds = (stop - start) / 1000.0f; | ||
|
||
float audioDuration = audio.getSamples().length / (float) audio.getSampleRate(); | ||
float real_time_factor = timeElapsedSeconds / audioDuration; | ||
|
||
String waveFilename = "tts-matcha-zh.wav"; | ||
audio.save(waveFilename); | ||
System.out.printf("-- elapsed : %.3f seconds\n", timeElapsedSeconds); | ||
System.out.printf("-- audio duration: %.3f seconds\n", timeElapsedSeconds); | ||
System.out.printf("-- real-time factor (RTF): %.3f\n", real_time_factor); | ||
System.out.printf("-- text: %s\n", text); | ||
System.out.printf("-- Saved to %s\n", waveFilename); | ||
|
||
tts.release(); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#!/usr/bin/env bash
#
# Builds the JNI library and the Java API jar if they are missing, downloads
# the English Matcha-TTS model and the HiFi-GAN vocoder if they are missing,
# and then runs NonStreamingTtsMatchaEn.java.
#
# All paths are relative to the current working directory.

set -ex

# Build the native JNI library only when neither the macOS (.dylib) nor the
# Linux (.so) artifact exists.
if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

# Build the Java API jar when it is missing.
if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

# please visit
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker
# to download more models
if [ ! -f ./matcha-icefall-en_US-ljspeech/model-steps-3.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-en_US-ljspeech.tar.bz2
  tar xvf matcha-icefall-en_US-ljspeech.tar.bz2
  rm matcha-icefall-en_US-ljspeech.tar.bz2
fi

if [ ! -f ./hifigan_v2.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
fi

# Quote the library path so a working directory containing spaces does not
# split the -D argument into several words.
java \
  -Djava.library.path="$PWD/../build/lib" \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  NonStreamingTtsMatchaEn.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
#!/usr/bin/env bash
#
# Builds the JNI library and the Java API jar if they are missing, downloads
# the Chinese Matcha-TTS model and the HiFi-GAN vocoder if they are missing,
# and then runs NonStreamingTtsMatchaZh.java.
#
# All paths are relative to the current working directory.

set -ex

# Build the native JNI library only when neither the macOS (.dylib) nor the
# Linux (.so) artifact exists.
if [[ ! -f ../build/lib/libsherpa-onnx-jni.dylib && ! -f ../build/lib/libsherpa-onnx-jni.so ]]; then
  mkdir -p ../build
  pushd ../build
  cmake \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    -DSHERPA_ONNX_ENABLE_JNI=ON \
    ..

  make -j4
  ls -lh lib
  popd
fi

# Build the Java API jar when it is missing.
if [ ! -f ../sherpa-onnx/java-api/build/sherpa-onnx.jar ]; then
  pushd ../sherpa-onnx/java-api
  make
  popd
fi

# please visit
# https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker
# to download more models
if [ ! -f ./matcha-icefall-zh-baker/model-steps-3.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
  tar xvf matcha-icefall-zh-baker.tar.bz2
  rm matcha-icefall-zh-baker.tar.bz2
fi

if [ ! -f ./hifigan_v2.onnx ]; then
  curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/vocoder-models/hifigan_v2.onnx
fi

# Quote the library path so a working directory containing spaces does not
# split the -D argument into several words.
java \
  -Djava.library.path="$PWD/../build/lib" \
  -cp ../sherpa-onnx/java-api/build/sherpa-onnx.jar \
  NonStreamingTtsMatchaZh.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
116 changes: 116 additions & 0 deletions
116
sherpa-onnx/java-api/src/com/k2fsa/sherpa/onnx/OfflineTtsMatchaModelConfig.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
// Copyright 2025 Xiaomi Corporation | ||
|
||
package com.k2fsa.sherpa.onnx; | ||
|
||
/**
 * Immutable configuration for a Matcha-TTS model, created through {@link #builder()}.
 *
 * <p>String settings default to the empty string; {@code noiseScale} and {@code lengthScale}
 * default to {@code 1.0f}.
 *
 * <p>NOTE(review): the private field names may be read by code outside this class (for example
 * native bindings) — confirm before renaming any of them.
 */
public class OfflineTtsMatchaModelConfig {
  private final String acousticModel;
  private final String vocoder;
  private final String lexicon;
  private final String tokens;
  private final String dataDir;
  private final String dictDir;
  private final float noiseScale;
  private final float lengthScale;

  /** Copies every setting from the given builder. */
  private OfflineTtsMatchaModelConfig(Builder builder) {
    this.acousticModel = builder.acousticModel;
    this.vocoder = builder.vocoder;
    this.lexicon = builder.lexicon;
    this.tokens = builder.tokens;
    this.dataDir = builder.dataDir;
    this.dictDir = builder.dictDir;
    this.noiseScale = builder.noiseScale;
    this.lengthScale = builder.lengthScale;
  }

  /** Returns a new builder pre-populated with the default values. */
  public static Builder builder() {
    return new Builder();
  }

  /** Returns the configured acoustic model value. */
  public String getAcousticModel() {
    return acousticModel;
  }

  /** Returns the configured vocoder value. */
  public String getVocoder() {
    return vocoder;
  }

  /** Returns the configured lexicon value. */
  public String getLexicon() {
    return lexicon;
  }

  /** Returns the configured tokens value. */
  public String getTokens() {
    return tokens;
  }

  /** Returns the configured data directory value. */
  public String getDataDir() {
    return dataDir;
  }

  /** Returns the configured dict directory value. */
  public String getDictDir() {
    return dictDir;
  }

  /** Returns the configured length scale. */
  public float getLengthScale() {
    return lengthScale;
  }

  /** Returns the configured noise scale. */
  public float getNoiseScale() {
    return noiseScale;
  }

  /**
   * Fluent builder for {@link OfflineTtsMatchaModelConfig}. Every setter stores its argument
   * as-is and returns this builder so calls can be chained.
   */
  public static class Builder {
    private String acousticModel = "";
    private String vocoder = "";
    private String lexicon = "";
    private String tokens = "";
    private String dataDir = "";
    private String dictDir = "";
    private float noiseScale = 1.0f;
    private float lengthScale = 1.0f;

    /** Creates a configuration from the values currently held by this builder. */
    public OfflineTtsMatchaModelConfig build() {
      return new OfflineTtsMatchaModelConfig(this);
    }

    /** Sets the acoustic model value. */
    public Builder setAcousticModel(String acousticModel) {
      this.acousticModel = acousticModel;
      return this;
    }

    /** Sets the vocoder value. */
    public Builder setVocoder(String vocoder) {
      this.vocoder = vocoder;
      return this;
    }

    /** Sets the tokens value. */
    public Builder setTokens(String tokens) {
      this.tokens = tokens;
      return this;
    }

    /** Sets the lexicon value. */
    public Builder setLexicon(String lexicon) {
      this.lexicon = lexicon;
      return this;
    }

    /** Sets the data directory value. */
    public Builder setDataDir(String dataDir) {
      this.dataDir = dataDir;
      return this;
    }

    /** Sets the dict directory value. */
    public Builder setDictDir(String dictDir) {
      this.dictDir = dictDir;
      return this;
    }

    /** Sets the noise scale. */
    public Builder setNoiseScale(float noiseScale) {
      this.noiseScale = noiseScale;
      return this;
    }

    /** Sets the length scale. */
    public Builder setLengthScale(float lengthScale) {
      this.lengthScale = lengthScale;
      return this;
    }
  }
}
Oops, something went wrong.