Commit cdc2310

release: Ints
1 parent 6b1411c commit cdc2310

File tree

9 files changed: +1480 -0 lines changed

models/tts/ints/README.md

Lines changed: 56 additions & 0 deletions
@@ -0,0 +1,56 @@
# Ints

## Overview

Ints is a text-to-speech model that can generate speech from text.

## Clone and Environment

First, follow the steps below to clone the repository and set up the environment.

```bash
git clone https://github.com/open-mmlab/Amphion.git

# enter the repository directory
cd Amphion
```

Now, create a new conda environment and activate it.

```bash
conda create -n ints python=3.10
conda activate ints
```

Then, install the dependencies.

```bash
bash models/tts/ints/env.sh
```

## Run Gradio 🤗 Playground Locally

You can run the following command to interact with the playground:

```bash
python -m models.tts.ints.gradio_app --port 7860 --use_vllm
```
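
If you prefer to call the model from Python rather than through the UI, a minimal sketch is below. It mirrors how `models/tts/ints/gradio_app.py` loads and calls the model; the `Ints` call signature and the 24 kHz output come from that script, and the prompt paths are placeholders:

```python
import os

import soundfile as sf
from huggingface_hub import snapshot_download

from models.tts.ints.ints import Ints
from utils.util import load_config

# Same setup as models/tts/ints/gradio_app.py
cfg = load_config("models/tts/ints/ints.json")
llm_path = os.path.join(snapshot_download("amphion/Ints"), "ints_v2")
model = Ints(
    llm_path=llm_path,
    cfg=cfg,
    device="cuda:0",
    use_vllm=True,
    use_flash_attn=False,
    gpu_memory_utilization=0.4,
)

# Called as (text, prompt_audio, prompt_text, top_k, top_p, temperature),
# returning the generated audio and a debug dict.
gen_audio, debug_dict = model(
    "Text to synthesize.",
    "path/to/prompt.wav",               # placeholder prompt audio
    "Transcript of the prompt audio.",  # placeholder prompt text
    20,
    0.98,
    1.0,
)
sf.write("output.wav", gen_audio, 24000)  # 24 kHz, as in the Gradio app
```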

## Citations

If you use Ints in your research, please cite the following papers:

```bibtex
@article{amphion_v0.2,
    title   = {Overview of the Amphion Toolkit (v0.2)},
    author  = {Jiaqi Li and Xueyao Zhang and Yuancheng Wang and Haorui He and Chaoren Wang and Li Wang and Huan Liao and Junyi Ao and Zeyu Xie and Yiqiao Huang and Junan Zhang and Zhizheng Wu},
    year    = {2025},
    journal = {arXiv preprint arXiv:2501.15442},
}
@inproceedings{amphion,
    author    = {Zhang, Xueyao and Xue, Liumeng and Gu, Yicheng and Wang, Yuancheng and Li, Jiaqi and He, Haorui and Wang, Chaoren and Song, Ting and Chen, Xi and Fang, Zihao and Chen, Haopeng and Zhang, Junan and Tang, Tze Ying and Zou, Lexiao and Wang, Mingxuan and Han, Jun and Chen, Kai and Li, Haizhou and Wu, Zhizheng},
    title     = {Amphion: An Open-Source Audio, Music and Speech Generation Toolkit},
    booktitle = {{IEEE} Spoken Language Technology Workshop, {SLT} 2024},
    year      = {2024}
}
```

models/tts/ints/chat_template.py

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
# Copyright (c) 2025 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.


def format_chat_prompt_phi3(messages, add_assistant_token=True):
    """
    Convert the messages list into the phi-3 chat template format.

    Args:
        messages: A list of messages containing role and content.
        add_assistant_token: Whether to append the assistant start tag
            when the last message is not from the assistant.

    Returns:
        str: The formatted prompt string.
    """
    prompt = ""
    for msg in messages:
        role = msg["role"]
        content = msg["content"]
        # Add corresponding tags for system and user messages
        if role in ["system", "user"]:
            prompt += f"<|{role}|>\n{content}<|end|>\n"
        # Assistant messages keep the end tag unless they are the last
        # message, which is left open for the model to continue
        elif role == "assistant" and msg != messages[-1]:
            prompt += f"<|{role}|>\n{content}<|end|>\n"
        elif role == "assistant" and msg == messages[-1]:
            prompt += f"<|{role}|>\n{content}"

    # If the last message is not from the assistant, add the assistant tag
    if messages[-1]["role"] != "assistant" and add_assistant_token:
        prompt += "<|assistant|>"
    return prompt


def gen_chat_prompt_for_tts(text):
    template = [
        {
            "role": "system",
            "content": "You are a powerful AI assistant for speech understanding and generation.",
        },
        {
            "role": "user",
            "content": f"Please speak the following text out loud: {text}",
        },
    ]
    return format_chat_prompt_phi3(template)
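
For reference, the prompt string these functions produce for a TTS request looks like this (the expected output follows directly from the code above):

```python
from models.tts.ints.chat_template import gen_chat_prompt_for_tts

print(gen_chat_prompt_for_tts("Hello there."))
# <|system|>
# You are a powerful AI assistant for speech understanding and generation.<|end|>
# <|user|>
# Please speak the following text out loud: Hello there.<|end|>
# <|assistant|>
```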

models/tts/ints/env.sh

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
#!/bin/bash
conda activate ints

pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
pip install git+https://github.com/descriptinc/audiotools
pip install vllm==0.7.2 soundfile descript-audio-codec easydict pyworld librosa ffmpy importlib-resources json5 ruamel_yaml ipywidgets dualcodec
pip install gradio accelerate transformers==4.47.1

# optional:
pip install flash-attn --no-build-isolation
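
The torch wheels above target CUDA 12.1 (`cu121`). A quick check that the GPU build was installed, using standard PyTorch calls:

```python
import torch

print(torch.__version__)          # a cu121 build reports e.g. "2.x.y+cu121"
print(torch.cuda.is_available())  # True when a compatible GPU and driver are found
```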

models/tts/ints/gradio_app.py

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
# Copyright (c) 2025 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import os

import gradio as gr
from huggingface_hub import snapshot_download

from models.tts.ints.ints import Ints
from utils.util import load_config


def text_to_speech(
    prompt_text, text, prompt_audio, top_k=20, top_p=0.98, temperature=1.0
):
    gen_audio, debug_dict = ins_model(
        text, prompt_audio, prompt_text, top_k, top_p, temperature
    )
    # (sample rate in Hz, audio data as numpy array)
    output_audio = (24000, gen_audio)
    return output_audio, debug_dict


def create_demo():
    with gr.Blocks() as demo:
        gr.Markdown("# Ints Text-to-Speech")

        with gr.Row():
            with gr.Column():
                prompt_text = gr.Textbox(
                    label="Prompt Text",
                    placeholder="Enter the prompt text here...",
                    lines=2,
                )
                text = gr.Textbox(
                    label="Text to Synthesize",
                    placeholder="Enter the text you want to convert to speech...",
                    lines=3,
                )
                prompt_audio = gr.Audio(label="Prompt Audio", type="filepath")

                with gr.Row():
                    top_k = gr.Slider(
                        minimum=10, maximum=100, value=20, step=1, label="Top-k"
                    )
                    top_p = gr.Slider(
                        minimum=0.5, maximum=1.0, value=0.98, step=0.01, label="Top-p"
                    )
                    temperature = gr.Slider(
                        minimum=0.5,
                        maximum=1.5,
                        value=1.0,
                        step=0.01,
                        label="Temperature",
                    )

                generate_btn = gr.Button("Generate Speech")

            with gr.Column():
                output_audio = gr.Audio(label="Generated Speech")
                debug_dict = gr.JSON(label="Debug Dict")

        generate_btn.click(
            fn=text_to_speech,
            inputs=[
                prompt_text,
                text,
                prompt_audio,
                top_k,
                top_p,
                temperature,
            ],
            outputs=[output_audio, debug_dict],
        )

    return demo


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--port", type=int, default=7860)
    parser.add_argument("--server_name", type=str, default="0.0.0.0")
    parser.add_argument("--use_vllm", action="store_true")
    parser.add_argument("--use_flash_attn", action="store_true")
    parser.add_argument("--device", type=str, default="cuda:0")
    parser.add_argument("--config_path", type=str, default="models/tts/ints/ints.json")
    parser.add_argument("--model_name", type=str, default="ints_v2")
    parser.add_argument("--gpu_memory_utilization", type=float, default=0.4)
    args = parser.parse_args()

    ins_cfg = load_config(args.config_path)

    base_folder = snapshot_download("amphion/Ints")
    llm_path = os.path.join(base_folder, args.model_name)
    print(f"llm_path: {llm_path}")

    ins_model = Ints(
        llm_path=llm_path,
        cfg=ins_cfg,
        device=args.device,
        use_vllm=args.use_vllm,
        use_flash_attn=args.use_flash_attn,
        gpu_memory_utilization=args.gpu_memory_utilization,
    )

    # Run one sample generation before launching the demo (warm-up)
    text_to_speech(
        prompt_text="We do not break. We never give in. We never back down.",
        prompt_audio="models/tts/maskgct/wav/prompt.wav",
        text="I will make America great again.",
    )

    demo = create_demo()
    demo.launch(
        server_name=args.server_name,
        server_port=args.port,
    )

models/tts/ints/ints.json

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
{
    "preprocess": {
        // tokenizer
        "min_dur": 1,
        "max_dur": 30,
        "pad_token_id": 32000,
        "end_token_id": 32007,
        "bos_audio_token_id": 32064,
        "eos_audio_token_id": 32065,
        "audio_token_shift": 32066,
        // melspec
        "hop_size": 480,
        "sample_rate": 24000,
        "n_fft": 1920,
        "num_mels": 128,
        "win_size": 1920,
        "fmin": 0,
        "fmax": 12000,
        "mel_var": 8.14,
        "mel_mean": -4.92
    },
    "model": {
        // voicebox
        "voicebox": {
            "mel_dim": 128,
            "hidden_size": 1024,
            "num_layers": 16,
            "num_heads": 16,
            "cfg_scale": 0.2,
            "use_cond_code": true,
            "cond_codebook_size": 16384,
            "cond_dim": 1024,
            "cond_scale_factor": 4,
            "use_pretrained_model": false,
            "sigma": 1e-5,
            "time_scheduler": "cos"
        },
        // kmeans
        "kmeans": {
            "type": "dual_codec"
        },
        // dual codec
        "dual_codec": {
            "n_codebooks": 7,
            "codebook_size": 4096,
            "semantic_codebook_size": 16384,
            "is_causal": true,
            "semantic_downsample_factor": 4
        },
        "vocos": {
            "input_channels": 128,
            "dim": 1024,
            "intermediate_dim": 4096,
            "num_layers": 30,
            "n_fft": 1920,
            "hop_size": 480,
            "padding": "same"
        }
    }
}
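
Two rates implied by the numbers above, shown as simple arithmetic; the assumption that `semantic_downsample_factor` divides the mel frame rate is mine, not documented here:

```python
# Derived from ints.json: 24 kHz audio with a 480-sample hop.
mel_frame_rate = 24000 / 480          # = 50.0 mel frames per second
semantic_rate = mel_frame_rate / 4    # = 12.5 tokens/s, if semantic_downsample_factor
                                      #   divides the frame rate (assumption)
print(mel_frame_rate, semantic_rate)  # 50.0 12.5
```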
