Commit cdc2310

release: Ints
1 parent 6b1411c commit cdc2310

File tree

9 files changed: +1480 -0 lines changed

models/tts/ints/README.md

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
# Ints

## Overview

Ints is a text-to-speech model that can generate speech from text.

## Clone and Environment

First, follow the steps below to clone the repository and set up the environment.

```bash
git clone https://github.com/open-mmlab/Amphion.git

# enter the repository directory
cd Amphion
```

Now, create a new conda environment and activate it.

```bash
conda create -n ints python=3.10
conda activate ints
```

Then, install the dependencies.

```bash
bash models/tts/ints/env.sh
```

## Run Gradio 🤗 Playground Locally

You can run the following command to interact with the playground:

```bash
python -m models.tts.ints.gradio_app --port 7860 --use_vllm
```
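
If you prefer to call the model from Python rather than through the UI, a minimal sketch is below. It mirrors how `models/tts/ints/gradio_app.py` loads and calls the model; the `Ints` call signature and the 24 kHz output come from that script, and the prompt paths are placeholders:

```python
import os

import soundfile as sf
from huggingface_hub import snapshot_download

from models.tts.ints.ints import Ints
from utils.util import load_config

# Same setup as models/tts/ints/gradio_app.py
cfg = load_config("models/tts/ints/ints.json")
llm_path = os.path.join(snapshot_download("amphion/Ints"), "ints_v2")
model = Ints(
    llm_path=llm_path,
    cfg=cfg,
    device="cuda:0",
    use_vllm=True,
    use_flash_attn=False,
    gpu_memory_utilization=0.4,
)

# Called as (text, prompt_audio, prompt_text, top_k, top_p, temperature),
# returning the generated audio and a debug dict.
gen_audio, debug_dict = model(
    "Text to synthesize.",
    "path/to/prompt.wav",               # placeholder prompt audio
    "Transcript of the prompt audio.",  # placeholder prompt text
    20,
    0.98,
    1.0,
)
sf.write("output.wav", gen_audio, 24000)  # 24 kHz, as in the Gradio app
```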

## Citations

If you use Ints in your research, please cite the following papers:

```bibtex
@article{amphion_v0.2,
    title   = {Overview of the Amphion Toolkit (v0.2)},
    author  = {Jiaqi Li and Xueyao Zhang and Yuancheng Wang and Haorui He and Chaoren Wang and Li Wang and Huan Liao and Junyi Ao and Zeyu Xie and Yiqiao Huang and Junan Zhang and Zhizheng Wu},
    year    = {2025},
    journal = {arXiv preprint arXiv:2501.15442},
}
@inproceedings{amphion,
    author    = {Zhang, Xueyao and Xue, Liumeng and Gu, Yicheng and Wang, Yuancheng and Li, Jiaqi and He, Haorui and Wang, Chaoren and Song, Ting and Chen, Xi and Fang, Zihao and Chen, Haopeng and Zhang, Junan and Tang, Tze Ying and Zou, Lexiao and Wang, Mingxuan and Han, Jun and Chen, Kai and Li, Haizhou and Wu, Zhizheng},
    title     = {Amphion: An Open-Source Audio, Music and Speech Generation Toolkit},
    booktitle = {{IEEE} Spoken Language Technology Workshop, {SLT} 2024},
    year      = {2024}
}
```

models/tts/ints/chat_template.py

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
# Copyright (c) 2025 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.


def format_chat_prompt_phi3(messages, add_assistant_token=True):
    """
    Convert the messages list into the phi-3 chat template format.

    Args:
        messages: A list of messages containing role and content.
        add_assistant_token: Whether to append the assistant start tag
            when the last message is not from the assistant.

    Returns:
        str: The formatted prompt string.
    """
    prompt = ""
    for msg in messages:
        role = msg["role"]
        content = msg["content"]
        # Add corresponding tags for system and user messages
        if role in ["system", "user"]:
            prompt += f"<|{role}|>\n{content}<|end|>\n"
        # Assistant messages keep the end tag unless they are the last
        # message, which is left open for the model to continue
        elif role == "assistant" and msg != messages[-1]:
            prompt += f"<|{role}|>\n{content}<|end|>\n"
        elif role == "assistant" and msg == messages[-1]:
            prompt += f"<|{role}|>\n{content}"

    # If the last message is not from the assistant, add the assistant tag
    if messages[-1]["role"] != "assistant" and add_assistant_token:
        prompt += "<|assistant|>"
    return prompt


def gen_chat_prompt_for_tts(text):
    template = [
        {
            "role": "system",
            "content": "You are a powerful AI assistant for speech understanding and generation.",
        },
        {
            "role": "user",
            "content": f"Please speak the following text out loud: {text}",
        },
    ]
    return format_chat_prompt_phi3(template)
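
For reference, the prompt string these functions produce for a TTS request looks like this (the expected output follows directly from the code above):

```python
from models.tts.ints.chat_template import gen_chat_prompt_for_tts

print(gen_chat_prompt_for_tts("Hello there."))
# <|system|>
# You are a powerful AI assistant for speech understanding and generation.<|end|>
# <|user|>
# Please speak the following text out loud: Hello there.<|end|>
# <|assistant|>
```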

models/tts/ints/env.sh

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
#!/bin/bash
conda activate ints

pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
pip install git+https://github.com/descriptinc/audiotools
pip install vllm==0.7.2 soundfile descript-audio-codec easydict pyworld librosa ffmpy importlib-resources json5 ruamel_yaml ipywidgets dualcodec
pip install gradio accelerate transformers==4.47.1

# optional:
pip install flash-attn --no-build-isolation
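
The torch wheels above target CUDA 12.1 (`cu121`). A quick check that the GPU build was installed, using standard PyTorch calls:

```python
import torch

print(torch.__version__)          # a cu121 build reports e.g. "2.x.y+cu121"
print(torch.cuda.is_available())  # True when a compatible GPU and driver are found
```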

models/tts/ints/gradio_app.py

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
# Copyright (c) 2025 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import os

import gradio as gr
from huggingface_hub import snapshot_download

from models.tts.ints.ints import Ints
from utils.util import load_config


def text_to_speech(
    prompt_text, text, prompt_audio, top_k=20, top_p=0.98, temperature=1.0
):
    gen_audio, debug_dict = ins_model(
        text, prompt_audio, prompt_text, top_k, top_p, temperature
    )
    # (sample rate in Hz, audio data as numpy array)
    output_audio = (24000, gen_audio)
    return output_audio, debug_dict


def create_demo():
    with gr.Blocks() as demo:
        gr.Markdown("# Ints Text-to-Speech")

        with gr.Row():
            with gr.Column():
                prompt_text = gr.Textbox(
                    label="Prompt Text",
                    placeholder="Enter the prompt text here...",
                    lines=2,
                )
                text = gr.Textbox(
                    label="Text to Synthesize",
                    placeholder="Enter the text you want to convert to speech...",
                    lines=3,
                )
                prompt_audio = gr.Audio(label="Prompt Audio", type="filepath")

                with gr.Row():
                    top_k = gr.Slider(
                        minimum=10, maximum=100, value=20, step=1, label="Top-k"
                    )
                    top_p = gr.Slider(
                        minimum=0.5, maximum=1.0, value=0.98, step=0.01, label="Top-p"
                    )
                    temperature = gr.Slider(
                        minimum=0.5,
                        maximum=1.5,
                        value=1.0,
                        step=0.01,
                        label="Temperature",
                    )

                generate_btn = gr.Button("Generate Speech")

            with gr.Column():
                output_audio = gr.Audio(label="Generated Speech")
                debug_dict = gr.JSON(label="Debug Dict")

        generate_btn.click(
            fn=text_to_speech,
            inputs=[
                prompt_text,
                text,
                prompt_audio,
                top_k,
                top_p,
                temperature,
            ],
            outputs=[output_audio, debug_dict],
        )

    return demo


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--port", type=int, default=7860)
    parser.add_argument("--server_name", type=str, default="0.0.0.0")
    parser.add_argument("--use_vllm", action="store_true")
    parser.add_argument("--use_flash_attn", action="store_true")
    parser.add_argument("--device", type=str, default="cuda:0")
    parser.add_argument("--config_path", type=str, default="models/tts/ints/ints.json")
    parser.add_argument("--model_name", type=str, default="ints_v2")
    parser.add_argument("--gpu_memory_utilization", type=float, default=0.4)
    args = parser.parse_args()

    ins_cfg = load_config(args.config_path)

    base_folder = snapshot_download("amphion/Ints")
    llm_path = os.path.join(base_folder, args.model_name)
    print(f"llm_path: {llm_path}")

    ins_model = Ints(
        llm_path=llm_path,
        cfg=ins_cfg,
        device=args.device,
        use_vllm=args.use_vllm,
        use_flash_attn=args.use_flash_attn,
        gpu_memory_utilization=args.gpu_memory_utilization,
    )

    # Run one sample generation before launching the demo (warm-up)
    text_to_speech(
        prompt_text="We do not break. We never give in. We never back down.",
        prompt_audio="models/tts/maskgct/wav/prompt.wav",
        text="I will make America great again.",
    )

    demo = create_demo()
    demo.launch(
        server_name=args.server_name,
        server_port=args.port,
    )

models/tts/ints/ints.json

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
{
    "preprocess": {
        // tokenizer
        "min_dur": 1,
        "max_dur": 30,
        "pad_token_id": 32000,
        "end_token_id": 32007,
        "bos_audio_token_id": 32064,
        "eos_audio_token_id": 32065,
        "audio_token_shift": 32066,
        // melspec
        "hop_size": 480,
        "sample_rate": 24000,
        "n_fft": 1920,
        "num_mels": 128,
        "win_size": 1920,
        "fmin": 0,
        "fmax": 12000,
        "mel_var": 8.14,
        "mel_mean": -4.92
    },
    "model": {
        // voicebox
        "voicebox": {
            "mel_dim": 128,
            "hidden_size": 1024,
            "num_layers": 16,
            "num_heads": 16,
            "cfg_scale": 0.2,
            "use_cond_code": true,
            "cond_codebook_size": 16384,
            "cond_dim": 1024,
            "cond_scale_factor": 4,
            "use_pretrained_model": false,
            "sigma": 1e-5,
            "time_scheduler": "cos"
        },
        // kmeans
        "kmeans": {
            "type": "dual_codec"
        },
        // dual codec
        "dual_codec": {
            "n_codebooks": 7,
            "codebook_size": 4096,
            "semantic_codebook_size": 16384,
            "is_causal": true,
            "semantic_downsample_factor": 4
        },
        "vocos": {
            "input_channels": 128,
            "dim": 1024,
            "intermediate_dim": 4096,
            "num_layers": 30,
            "n_fft": 1920,
            "hop_size": 480,
            "padding": "same"
        }
    }
}
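
Two rates implied by the numbers above, shown as simple arithmetic; the assumption that `semantic_downsample_factor` divides the mel frame rate is mine, not documented here:

```python
# Derived from ints.json: 24 kHz audio with a 480-sample hop.
mel_frame_rate = 24000 / 480          # = 50.0 mel frames per second
semantic_rate = mel_frame_rate / 4    # = 12.5 tokens/s, if semantic_downsample_factor
                                      #   divides the frame rate (assumption)
print(mel_frame_rate, semantic_rate)  # 50.0 12.5
```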
