diff --git a/audio/tts_samples/bstc_s2st/3063_36_raw.wav b/audio/tts_samples/bstc_s2st/3063_36_raw.wav new file mode 100644 index 0000000..90c2762 Binary files /dev/null and b/audio/tts_samples/bstc_s2st/3063_36_raw.wav differ diff --git a/audio/tts_samples/bstc_s2st/3063_36_s2st.wav b/audio/tts_samples/bstc_s2st/3063_36_s2st.wav new file mode 100644 index 0000000..b79cc90 Binary files /dev/null and b/audio/tts_samples/bstc_s2st/3063_36_s2st.wav differ diff --git a/audio/tts_samples/bstc_s2st/3913_33_raw.wav b/audio/tts_samples/bstc_s2st/3913_33_raw.wav new file mode 100644 index 0000000..16a4112 Binary files /dev/null and b/audio/tts_samples/bstc_s2st/3913_33_raw.wav differ diff --git a/audio/tts_samples/bstc_s2st/3913_33_s2st.wav b/audio/tts_samples/bstc_s2st/3913_33_s2st.wav new file mode 100644 index 0000000..2a96141 Binary files /dev/null and b/audio/tts_samples/bstc_s2st/3913_33_s2st.wav differ diff --git a/audio/tts_samples/covost_s2st/common_voice_en_649346_raw.wav b/audio/tts_samples/covost_s2st/common_voice_en_649346_raw.wav new file mode 100644 index 0000000..5ff3b30 Binary files /dev/null and b/audio/tts_samples/covost_s2st/common_voice_en_649346_raw.wav differ diff --git a/audio/tts_samples/covost_s2st/common_voice_en_649346_s2st.wav b/audio/tts_samples/covost_s2st/common_voice_en_649346_s2st.wav new file mode 100644 index 0000000..fc9a4ba Binary files /dev/null and b/audio/tts_samples/covost_s2st/common_voice_en_649346_s2st.wav differ diff --git a/audio/tts_samples/covost_s2st/common_voice_en_691448_raw.wav b/audio/tts_samples/covost_s2st/common_voice_en_691448_raw.wav new file mode 100644 index 0000000..821fd44 Binary files /dev/null and b/audio/tts_samples/covost_s2st/common_voice_en_691448_raw.wav differ diff --git a/audio/tts_samples/covost_s2st/common_voice_en_691448_s2st.wav b/audio/tts_samples/covost_s2st/common_voice_en_691448_s2st.wav new file mode 100644 index 0000000..eeeff4f Binary files /dev/null and b/audio/tts_samples/covost_s2st/common_voice_en_691448_s2st.wav differ diff --git a/audio/tts_samples/covost_s2st/common_voice_en_700678_raw.wav b/audio/tts_samples/covost_s2st/common_voice_en_700678_raw.wav new file mode 100644 index 0000000..b51748f Binary files /dev/null and b/audio/tts_samples/covost_s2st/common_voice_en_700678_raw.wav differ diff --git a/audio/tts_samples/covost_s2st/common_voice_en_700678_s2st.wav b/audio/tts_samples/covost_s2st/common_voice_en_700678_s2st.wav new file mode 100644 index 0000000..6feb9a4 Binary files /dev/null and b/audio/tts_samples/covost_s2st/common_voice_en_700678_s2st.wav differ diff --git a/audio/tts_samples/pS0764_BAC009S0764W0169/S0764_BAC009S0764W0285.wav b/audio/tts_samples/pS0764_BAC009S0764W0169/S0764_BAC009S0764W0285.wav new file mode 100644 index 0000000..11e8541 Binary files /dev/null and b/audio/tts_samples/pS0764_BAC009S0764W0169/S0764_BAC009S0764W0285.wav differ diff --git a/audio/tts_samples/pS0764_BAC009S0764W0169/lauraGPT_pS0764_BAC009S0764W0169.wav b/audio/tts_samples/pS0764_BAC009S0764W0169/lauraGPT_pS0764_BAC009S0764W0169.wav new file mode 100644 index 0000000..a6fffe6 Binary files /dev/null and b/audio/tts_samples/pS0764_BAC009S0764W0169/lauraGPT_pS0764_BAC009S0764W0169.wav differ diff --git a/audio/tts_samples/pS0764_BAC009S0764W0169/prompt_pS0764_BAC009S0764W0169.wav b/audio/tts_samples/pS0764_BAC009S0764W0169/prompt_pS0764_BAC009S0764W0169.wav new file mode 100644 index 0000000..2a8b923 Binary files /dev/null and b/audio/tts_samples/pS0764_BAC009S0764W0169/prompt_pS0764_BAC009S0764W0169.wav differ diff --git a/audio/tts_samples/pS0764_BAC009S0764W0169/valle_phn_pS0764_BAC009S0764W0169.wav b/audio/tts_samples/pS0764_BAC009S0764W0169/valle_phn_pS0764_BAC009S0764W0169.wav new file mode 100644 index 0000000..3181631 Binary files /dev/null and b/audio/tts_samples/pS0764_BAC009S0764W0169/valle_phn_pS0764_BAC009S0764W0169.wav differ diff --git a/audio/tts_samples/pS0764_BAC009S0764W0169/valle_token_pS0764_BAC009S0764W0169.wav b/audio/tts_samples/pS0764_BAC009S0764W0169/valle_token_pS0764_BAC009S0764W0169.wav new file mode 100644 index 0000000..b730cda Binary files /dev/null and b/audio/tts_samples/pS0764_BAC009S0764W0169/valle_token_pS0764_BAC009S0764W0169.wav differ diff --git a/audio/tts_samples/pS0766_BAC009S0766W0321/S0766_BAC009S0766W0182.wav b/audio/tts_samples/pS0766_BAC009S0766W0321/S0766_BAC009S0766W0182.wav new file mode 100644 index 0000000..fccd831 Binary files /dev/null and b/audio/tts_samples/pS0766_BAC009S0766W0321/S0766_BAC009S0766W0182.wav differ diff --git a/audio/tts_samples/pS0766_BAC009S0766W0321/lauraGPT_pS0766_BAC009S0766W0321.wav b/audio/tts_samples/pS0766_BAC009S0766W0321/lauraGPT_pS0766_BAC009S0766W0321.wav new file mode 100644 index 0000000..404fb8b Binary files /dev/null and b/audio/tts_samples/pS0766_BAC009S0766W0321/lauraGPT_pS0766_BAC009S0766W0321.wav differ diff --git a/audio/tts_samples/pS0766_BAC009S0766W0321/prompt_pS0766_BAC009S0766W0321.wav b/audio/tts_samples/pS0766_BAC009S0766W0321/prompt_pS0766_BAC009S0766W0321.wav new file mode 100644 index 0000000..5e395a2 Binary files /dev/null and b/audio/tts_samples/pS0766_BAC009S0766W0321/prompt_pS0766_BAC009S0766W0321.wav differ diff --git a/audio/tts_samples/pS0766_BAC009S0766W0321/valle_phn_pS0766_BAC009S0766W0321.wav b/audio/tts_samples/pS0766_BAC009S0766W0321/valle_phn_pS0766_BAC009S0766W0321.wav new file mode 100644 index 0000000..c8f459a Binary files /dev/null and b/audio/tts_samples/pS0766_BAC009S0766W0321/valle_phn_pS0766_BAC009S0766W0321.wav differ diff --git a/audio/tts_samples/pS0766_BAC009S0766W0321/valle_token_pS0766_BAC009S0766W0321.wav b/audio/tts_samples/pS0766_BAC009S0766W0321/valle_token_pS0766_BAC009S0766W0321.wav new file mode 100644 index 0000000..6c1046a Binary files /dev/null and b/audio/tts_samples/pS0766_BAC009S0766W0321/valle_token_pS0766_BAC009S0766W0321.wav differ diff --git a/audio/tts_samples/pS0906_BAC009S0906W0202/S0906_BAC009S0906W0181.wav b/audio/tts_samples/pS0906_BAC009S0906W0202/S0906_BAC009S0906W0181.wav new file mode 100644 index 0000000..ae80be7 Binary files /dev/null and b/audio/tts_samples/pS0906_BAC009S0906W0202/S0906_BAC009S0906W0181.wav differ diff --git a/audio/tts_samples/pS0906_BAC009S0906W0202/lauraGPT_pS0906_BAC009S0906W0202.wav b/audio/tts_samples/pS0906_BAC009S0906W0202/lauraGPT_pS0906_BAC009S0906W0202.wav new file mode 100644 index 0000000..60d3d30 Binary files /dev/null and b/audio/tts_samples/pS0906_BAC009S0906W0202/lauraGPT_pS0906_BAC009S0906W0202.wav differ diff --git a/audio/tts_samples/pS0906_BAC009S0906W0202/prompt_pS0906_BAC009S0906W0202.wav b/audio/tts_samples/pS0906_BAC009S0906W0202/prompt_pS0906_BAC009S0906W0202.wav new file mode 100644 index 0000000..69b607f Binary files /dev/null and b/audio/tts_samples/pS0906_BAC009S0906W0202/prompt_pS0906_BAC009S0906W0202.wav differ diff --git a/audio/tts_samples/pS0906_BAC009S0906W0202/valle_phn_pS0906_BAC009S0906W0202.wav b/audio/tts_samples/pS0906_BAC009S0906W0202/valle_phn_pS0906_BAC009S0906W0202.wav new file mode 100644 index 0000000..dd702b8 Binary files /dev/null and b/audio/tts_samples/pS0906_BAC009S0906W0202/valle_phn_pS0906_BAC009S0906W0202.wav differ diff --git a/audio/tts_samples/pS0906_BAC009S0906W0202/valle_token_pS0906_BAC009S0906W0202.wav b/audio/tts_samples/pS0906_BAC009S0906W0202/valle_token_pS0906_BAC009S0906W0202.wav new file mode 100644 index 0000000..48d835b Binary files /dev/null and b/audio/tts_samples/pS0906_BAC009S0906W0202/valle_token_pS0906_BAC009S0906W0202.wav differ diff --git a/audio/tts_samples/pS0908_BAC009S0908W0473/S0908_BAC009S0908W0361.wav b/audio/tts_samples/pS0908_BAC009S0908W0473/S0908_BAC009S0908W0361.wav new file mode 100644 index 0000000..1962eeb Binary files /dev/null and b/audio/tts_samples/pS0908_BAC009S0908W0473/S0908_BAC009S0908W0361.wav differ diff --git a/audio/tts_samples/pS0908_BAC009S0908W0473/lauraGPT_pS0908_BAC009S0908W0473.wav b/audio/tts_samples/pS0908_BAC009S0908W0473/lauraGPT_pS0908_BAC009S0908W0473.wav new file mode 100644 index 0000000..76456aa Binary files /dev/null and b/audio/tts_samples/pS0908_BAC009S0908W0473/lauraGPT_pS0908_BAC009S0908W0473.wav differ diff --git a/audio/tts_samples/pS0908_BAC009S0908W0473/prompt_pS0908_BAC009S0908W0473.wav b/audio/tts_samples/pS0908_BAC009S0908W0473/prompt_pS0908_BAC009S0908W0473.wav new file mode 100644 index 0000000..45b0b90 Binary files /dev/null and b/audio/tts_samples/pS0908_BAC009S0908W0473/prompt_pS0908_BAC009S0908W0473.wav differ diff --git a/audio/tts_samples/pS0908_BAC009S0908W0473/valle_phn_pS0908_BAC009S0908W0473.wav b/audio/tts_samples/pS0908_BAC009S0908W0473/valle_phn_pS0908_BAC009S0908W0473.wav new file mode 100644 index 0000000..856533d Binary files /dev/null and b/audio/tts_samples/pS0908_BAC009S0908W0473/valle_phn_pS0908_BAC009S0908W0473.wav differ diff --git a/audio/tts_samples/pS0908_BAC009S0908W0473/valle_token_pS0908_BAC009S0908W0473.wav b/audio/tts_samples/pS0908_BAC009S0908W0473/valle_token_pS0908_BAC009S0908W0473.wav new file mode 100644 index 0000000..231dc79 Binary files /dev/null and b/audio/tts_samples/pS0908_BAC009S0908W0473/valle_token_pS0908_BAC009S0908W0473.wav differ diff --git a/index.html b/index.html index c0de4f8..4fe6ded 100644 --- a/index.html +++ b/index.html @@ -125,6 +125,23 @@

TTS

+
+
+
+
+

S2ST

+ + +
+
+
+
+ +
@@ -330,197 +347,516 @@

1. Automatic speech recognition (ASR) samples

2. Text-to-speech synthesis (TTS) samples


-
[Prompt: 1995_1837_000020_000000] Up in the sick room Zora lay on the little white bed. [Continuation: 1995_1836_000003_000002] At last the Cotton Combine was to all appearances an assured fact and he was slated for the Senate.
-
- - - - - - - - - - - - - - - - - - - - - - - - -
-
Prompt wav (16k)Ground-truth (16k)VALLE-PhoneVALLE-TokenLauraGPT
- - - - - - - - - -
-
[Prompt: 2830_3980_000018_000001] Humble man that he was, he will not now take a back seat. [Continuation: 2830_3980_000018_000000] Against these boasting, false apostles, Paul boldly defends his apostolic authority and ministry.
- - - - - - - - - - - - - - - - - - - - - - - - -
-
Prompt wav (16k)Ground-truth (16k)VALLE-PhoneVALLE-TokenLauraGPT
- - - - - - - - - -
-
[Prompt: 6829_68771_000046_000000] A sudden wave of scarlet swept over Eliza's face. [Continuation: 6829_68769_000030_000000] Then he deliberately locked Kenneth and Beth in with the forger, and retreated along the passage.
- - - - - - - - - - - - - - - +
2.1 LibriTTS Zero Shot TTS
+
+
[Prompt: 1995_1837_000020_000000] Up in the sick room Zora lay on the little white bed. [Continuation: 1995_1836_000003_000002] At last the Cotton Combine was to all appearances an assured fact and he was slated for the Senate.
+
+
Prompt wav (16k)Ground-truth (16k)VALLE-PhoneVALLE-TokenLauraGPT
- - - -
+ + + + + + + + + + + + + + + + + + + + + + + +
+
Prompt wav (16k)Ground-truth (16k)VALL-E PhoneVALL-E TokenLauraGPT
+ + + + + + + + + +
+
[Prompt: 2830_3980_000018_000001] Humble man that he was, he will not now take a back seat. [Continuation: 2830_3980_000018_000000] Against these boasting, false apostles, Paul boldly defends his apostolic authority and ministry.
+ + + + + + + + + + + + + + + + + + + + + + + + +
+
Prompt wav (16k)Ground-truth (16k)VALL-E PhoneVALL-E TokenLauraGPT
+ + + + + + + + + +
+
[Prompt: 6829_68771_000046_000000] A sudden wave of scarlet swept over Eliza's face. [Continuation: 6829_68769_000030_000000] Then he deliberately locked Kenneth and Beth in with the forger, and retreated along the passage.
+ + + + + + + + + + + + + + + + + + + + + + + + +
+
Prompt wav (16k)Ground-truth (16k)VALL-E PhoneVALL-E TokenLauraGPT
+ + + + + + + + + +
+
[Prompt: 8230_279154_000004_000008] To deal with this problem, we must have a theory of memory. [Continuation: 8230_279154_000019_000000] The first of our vague but indubitable data is that there is knowledge of the past.
+ + + + + + + + + + + + + + + + + + + + + + + +
2.2 AISHELL Zero Shot TTS
+
+
[Prompt: S0764_BAC009S0764W0169] 实施较大幅度的补贴政策 [Continuation: S0764_BAC009S0764W0285] 两家公司是联网汽车的主要芯片供应商。
+
+
Prompt wav (16k)Ground-truth (16k)VALL-E PhoneVALL-E TokenLauraGPT
+ + + + + + + + + +
+ + + + + + + + + + + + + + + + + + + + + + + +
+
Prompt wav (16k)Ground-truth (16k)VALL-E PhoneVALL-E TokenLauraGPT
+ + + + + + + + + +
+
[Prompt: S0906_BAC009S0906W0202] 加强农牧互补牧养结合 [Continuation: S0906_BAC009S0906W0181] 月度市场成交量开始出现环比回升。
+ + + + + + + + + + + + + + + + + + + + + + + + +
+
Prompt wav (16k)Ground-truth (16k)VALL-E PhoneVALL-E TokenLauraGPT
+ + + + + + + + + +
+
[Prompt: S0766_BAC009S0766W0321] 新能源汽车市场在逐步启动 [Continuation: S0766_BAC009S0766W0182] 转型后的今久整合营销集团。
+ + + + + + + + + + + + + + + + + + + + + + + + +
+
Prompt wav (16k)Ground-truth (16k)VALL-E PhoneVALL-E TokenLauraGPT
+ + + + + + + + + +
+
[Prompt: S0908_BAC009S0908W0473] 参考消息网七月八日报道 [Continuation: S0908_BAC009S0908W0361] 帮助仰泳运动员改善自己的出发技术。
+ + + + + + + + + + + + + + + + + + + + + + - - + +
Prompt wav (16k)Ground-truth (16k)VALL-E PhoneVALL-E TokenLauraGPT
+ + + + + + + + + +
- - - -
+
- - - - - -
- -
[Prompt: 8230_279154_000004_000008] To deal with this problem, we must have a theory of memory. [Continuation: 8230_279154_000019_000000] The first of our vague but indubitable data is that there is knowledge of the past.
+

3. Zero-shot speech-to-speech translation (S2ST) samples

+
- - - - - - - - - - - - + +
+
Prompt wav (16k)Ground-truth (16k)VALLE-PhoneVALLE-TokenLauraGPT
- -
+
3.1 English-to-Chinese Translation using CoVOST2 dataset
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - - + +
+
English TextEnglish SpeechLauraGPT Translated Speech
+ two workers in orange vests perform their job. + + + + +
+ two boys are playing soccer in the water at the beach. + + + + +
+ many programming languages are named after real people. + + + + +
- - - -
+
3.2 Chinese-to-English Translation using BSTC dataset
+ + + + + + + + + + + + + + + + + + + + + + + + - - -
Chinese TextChinese SpeechLauraGPT Translated Speech
+ 但不是这种所有的可能性都可以在市场上成功的。 + + + + +
+ 要知道每个人都是怕输的,对吗? + + + + +
- - - -
-

3. Speech to text translation (S2TT) samples

+ +

4. Speech to text translation (S2TT) samples


@@ -614,7 +950,7 @@

3. Speech to text translation (S2TT) samples

-

4. Speech enhancement (SE) samples

+

5. Speech enhancement (SE) samples


@@ -921,9 +1257,9 @@

Results

- - - + + + @@ -932,25 +1268,49 @@

Results

- - - - - + + + + + + + + + + + + + + + + - - - - + + + + + + + + + + + + + + + - - - - + + + + + + @@ -966,37 +1326,60 @@

Results

Models / DatasetsModelsModels sizeData size AISHELL(test) AISHELL-2(test-ios) LibriSpeech(test-clean)
Paraformer Large1.952.85//Paraformer (CN)0.2 B60K2.02.9--
Paraformer (EN)0.2 B20K--3.58.2
Whisper Large V2//2.705.241.5 B680K5.75.52.75.2
Discrete IO1.8 B22K7.18.69.124.0
LauraGPT1.763.154.527.862.0 B22K1.83.24.47.7
- - - - - + + + + + + + + + + + + + + + - - - - - + + + + + + + - - - - - + + + + + + + - - - - - + + + + + + + + + + + + + + + + - -
Models / DatasetsAISHELL(test)AISHELL-2(test-ios)LibriSpeech(test-clean)LibriSpeech(test-others)ModelsAISHELLLibriTTS
- CER ↓SECS ↑MOSNet ↑CER ↓SECS ↑MOSNet ↑
Paraformer Large1.952.85//Origin1.700.923.272.900.943.35
Whisper Large V2//2.705.24VALL-E Phone4.750.913.224.300.923.28
LauraGPT1.763.154.527.86VALL-E Token6.520.913.196.570.933.28
LauraGPT (Ours)6.910.903.148.620.913.26
@@ -1010,17 +1393,30 @@

Results

Models - BSTC dev + Zh→En + En→Zh SMLTA ASR + pre-trained MT (Zhang et al., 2021) 18.22 + - - LauraGPT - 17.75 + Wang et al. (2020) + - + 25.4 + + + Discrete IO + 5.1 + 5.0 + + + LauraGPT (Ours) + 17.8 + 38.5 @@ -1037,28 +1433,53 @@

Results

Models - PESQ - STOI - WER + PESQ ↑ + STOI ↑ + CER ↓ + WER ↓ - Raw + Clean + 4.50 + 100.0 + 3.31 + 7.55 + + + Clean_codec_syn + 2.72 + 87.0 + 7.46 + 14.28 + + + Noisy 2.34 - 85.00 + 85.0 + 13.81 23.00 CMGAN - 3.16 - 92.50 - 11.63 + 2.95 + 91.0 + 6.42 + 12.29 + + + Discrete IO + 1.96 + 64.0 + 40.91 + 53.97 LauraGPT 2.97 - 87.64 + 88.0 + 9.05 15.94 @@ -1076,47 +1497,40 @@

Results

Models - BLEU-4 - SPIDEr - CIDEr - SPICE + BLEU-4 ↑ + SPICE ↑ + CIDEr ↑ + SPIDEr ↑ Oracle 1.00 - 1.54 - 2.64 0.43 + 2.64 + 1.54 - GRU (Drossos et al., 2017) - - - - - 0.18 - - - - - Baseline + Drossos et al. (2020) (EncDec-Attn) 0.02 - 0.10 - - Ensemble + Koizumi et al. (2020) (Ensemble) - - 0.207 - 0.319 - 0.094 + 0.09 + 0.32 + 0.21 LauraGPT 0.08 - 0.08 - 0.22 0.15 + 0.22 + 0.08 @@ -1125,7 +1539,7 @@

Results

- 6. Evaluations on speech enhancement (SE) task + 6. Evaluations on speech emotion recognition (SER) task

@@ -1140,19 +1554,25 @@

Results

- WavLM Base + Chen et al. (2023) WavLM Base 0.499 0.201 0.400 - WavLM Large + Chen et al. (2023) WavLM Large 0.542 0.253 0.476 - LauraGPT + Chen et al. (2023) Vesper-12 + 0.535 + 0.268 + 0.480 + + + LauraGPT (Ours) 0.507 0.312 0.492 @@ -1182,31 +1602,25 @@

Results

- Direct - 81.73 - 77.11 - 75.05 - 61.24 - 65.42 - 63.26 + Ravanelli et al. (2021) CRDNN + 82.15 + 77.79 + 75.64 + 62.35 + 66.45 + 64.34 - Direct (HuBERT) - 91.24 - 88.47 - 87.54 - 72.93 - 77.40 - 75.10 + Ravanelli et al. (2021) Wav2Vec 2.0 + 89.49 + 86.40 + 85.34 + 72.60 + 76.76 + 74.62 - LauraGPT - 91.04 - 89.07 - 87.87 - - - LauraGPT + LauraGPT (Ours) 91.04 89.07 87.87