diff --git a/models/codec/dualcodec/README.md b/models/codec/dualcodec/README.md index a8325c19..94208a33 100644 --- a/models/codec/dualcodec/README.md +++ b/models/codec/dualcodec/README.md @@ -5,8 +5,8 @@ [![arXiv](https://img.shields.io/badge/arXiv-2505.13000-brightgreen.svg?style=flat-square)](http://arxiv.org/abs/2505.13000) [![githubio](https://img.shields.io/badge/GitHub.io-Demo_Page-blue?logo=Github&style=flat-square)](https://dualcodec.github.io/) [![PyPI](https://img.shields.io/pypi/v/dualcodec?color=blue&label=PyPI&logo=PyPI&style=flat-square)](https://pypi.org/project/dualcodec/) -![GitHub](https://img.shields.io/badge/Github-Dev_Release-pink?logo=Github&style=flat-square) -![Amphion](https://img.shields.io/badge/Amphion-Stable_Release-blue?style=flat-square) +[![GitHub](https://img.shields.io/badge/Github-Dev_Release-pink?logo=Github&style=flat-square)](https://github.com/jiaqili3/dualcodec) +[![Amphion](https://img.shields.io/badge/Amphion-Stable_Release-blue?style=flat-square)](https://github.com/open-mmlab/Amphion/blob/main/models/codec/dualcodec/README.md) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1VvUhsDffLdY5TdNuaqlLnYzIoXhvI8MK#scrollTo=Lsos3BK4J-4E) ## About @@ -125,11 +125,11 @@ This will launch an app that allows you to upload a wav file and get the output ## DualCodec-based TTS models Models available: - DualCodec-VALLE: A super fast 12.5Hz VALL-E TTS model based on DualCodec. -- DualCodec-Voicebox: A flow matching decoder for DualCodec 12.5Hz's semantic codes. +- DualCodec-Voicebox: A flow matching decoder for DualCodec 12.5Hz's semantic codes. It can be used as the second stage of a TTS system, but the component alone is not a TTS model. 
To continue, first install other necessary components for training: ```bash -pip install "dualcodec[train]" +pip install "dualcodec[tts]" ``` Alternatively, if you want to install from source, ```bash @@ -170,7 +170,11 @@ pip install -U wandb protobuf transformers ```bash pip install "dualcodec[tts]" ``` -2. Clone this repository and `cd` to the project root folder (the folder that contains this readme). +2. Clone this repository and `cd` to the project root folder (the folder that contains this readme): +```bash +git clone https://github.com/open-mmlab/Amphion.git +cd Amphion/models/codec/dualcodec/ +``` 3. To run example training on example Emilia German data: ```bash @@ -221,11 +225,31 @@ data.segment_speech.segment_length=24000 ## Citation -``` +If you find this work useful for your research, please cite our paper: + +```bibtex @inproceedings{dualcodec, title = {DualCodec: A Low-Frame-Rate, Semantically-Enhanced Neural Audio Codec for Speech Generation}, author = {Li, Jiaqi and Lin, Xiaolong and Li, Zhekai and Huang, Shixi and Wang, Yuancheng and Wang, Chaoren and Zhan, Zhenpeng and Wu, Zhizheng}, booktitle = {Proceedings of Interspeech 2025}, year = {2025} } -``` \ No newline at end of file +``` + +If you use the pre-trained models or training recipe of Amphion, please also cite: + +```bibtex +@article{amphion2, + title = {Overview of the Amphion Toolkit (v0.2)}, + author = {Jiaqi Li and Xueyao Zhang and Yuancheng Wang and Haorui He and Chaoren Wang and Li Wang and Huan Liao and Junyi Ao and Zeyu Xie and Yiqiao Huang and Junan Zhang and Zhizheng Wu}, + year = {2025}, + journal = {arXiv preprint arXiv:2501.15442}, +} + +@inproceedings{amphion, + author={Xueyao Zhang and Liumeng Xue and Yicheng Gu and Yuancheng Wang and Jiaqi Li and Haorui He and Chaoren Wang and Ting Song and Xi Chen and Zihao Fang and Haopeng Chen and Junan Zhang and Tze Ying Tang and Lexiao Zou and Mingxuan Wang and Jun Han and Kai Chen and Haizhou Li and Zhizheng Wu}, + title={Amphion: An 
Open-Source Audio, Music and Speech Generation Toolkit}, + booktitle={{IEEE} Spoken Language Technology Workshop, {SLT} 2024}, + year={2024} +} +``` 
diff --git a/models/codec/dualcodec/dualcodec/model_codec/dac_layers.py b/models/codec/dualcodec/dualcodec/model_codec/dac_layers.py
new file mode 100644
index 00000000..44fbc292
--- /dev/null
+++ b/models/codec/dualcodec/dualcodec/model_codec/dac_layers.py
@@ -0,0 +1,42 @@
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+from torch.nn.utils import weight_norm
+
+
+def WNConv1d(*args, **kwargs):
+    """Build ``nn.Conv1d(*args, **kwargs)`` and return it wrapped in ``weight_norm``."""
+    return weight_norm(nn.Conv1d(*args, **kwargs))
+
+
+def WNConvTranspose1d(*args, **kwargs):
+    """Build ``nn.ConvTranspose1d(*args, **kwargs)`` and return it wrapped in ``weight_norm``."""
+    return weight_norm(nn.ConvTranspose1d(*args, **kwargs))
+
+
+# Scripting this brings model speed up 1.4x
+@torch.jit.script
+def snake(x, alpha):
+    """Return ``x + (1 / (alpha + 1e-9)) * sin(alpha * x) ** 2`` in the shape of ``x``."""
+    shape = x.shape
+    # Collapse every dim after the first two into one; undone before returning.
+    x = x.reshape(shape[0], shape[1], -1)
+    # The 1e-9 offset keeps the reciprocal finite when alpha is exactly 0.
+    x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2)
+    x = x.reshape(shape)
+    return x
+
+
+class Snake1d(nn.Module):
+    """Apply ``snake`` with a learnable ``alpha`` parameter of shape ``(1, channels, 1)``."""
+
+    def __init__(self, channels):
+        super().__init__()
+        # Learnable alpha, initialised to ones.
+        self.alpha = nn.Parameter(torch.ones(1, channels, 1))
+
+    def forward(self, x):
+        """Return ``snake(x, self.alpha)``."""
+        return snake(x, self.alpha)
diff --git a/models/codec/dualcodec/dualcodec/model_codec/dac_model.py b/models/codec/dualcodec/dualcodec/model_codec/dac_model.py index eb9107ac..4d16b288 100755 --- a/models/codec/dualcodec/dualcodec/model_codec/dac_model.py +++ b/models/codec/dualcodec/dualcodec/model_codec/dac_model.py @@ -15,9 +15,9 @@ from audiotools.ml import BaseModel from torch import nn -from dac.nn.layers import Snake1d -from dac.nn.layers import WNConv1d -from dac.nn.layers import WNConvTranspose1d +from .dac_layers import Snake1d +from .dac_layers import WNConv1d +from .dac_layers import WNConvTranspose1d from .dac_quantize import ResidualVectorQuantize from easydict import EasyDict as edict import torch.nn.functional as F diff --git
a/models/codec/dualcodec/dualcodec/model_codec/dac_quantize.py b/models/codec/dualcodec/dualcodec/model_codec/dac_quantize.py index 520109b7..59406096 100755 --- a/models/codec/dualcodec/dualcodec/model_codec/dac_quantize.py +++ b/models/codec/dualcodec/dualcodec/model_codec/dac_quantize.py @@ -17,7 +17,7 @@ from torch.nn.utils import weight_norm except: from torch.nn.utils.parameterizations import weight_norm -from dac.nn.layers import WNConv1d +from .dac_layers import WNConv1d class VectorQuantize(nn.Module): diff --git a/models/codec/dualcodec/dualcodec/model_codec/dualcodec_model.py b/models/codec/dualcodec/dualcodec/model_codec/dualcodec_model.py index f59d1159..5d482ce4 100755 --- a/models/codec/dualcodec/dualcodec/model_codec/dualcodec_model.py +++ b/models/codec/dualcodec/dualcodec/model_codec/dualcodec_model.py @@ -17,9 +17,9 @@ from torch import nn # from .base import CodecMixin -from dac.nn.layers import Snake1d -from dac.nn.layers import WNConv1d -from dac.nn.layers import WNConvTranspose1d +from .dac_layers import Snake1d +from .dac_layers import WNConv1d +from .dac_layers import WNConvTranspose1d from .dac_quantize import ResidualVectorQuantize from easydict import EasyDict as edict import torch.nn.functional as F diff --git a/models/codec/dualcodec/pyproject.toml b/models/codec/dualcodec/pyproject.toml index b845b445..26b97fe2 100644 --- a/models/codec/dualcodec/pyproject.toml +++ b/models/codec/dualcodec/pyproject.toml @@ -1,10 +1,10 @@ [project] name = "dualcodec" -version = "0.3.7" +version = "0.4.0" description = "The DualCodec neural audio codec." 
dependencies = [ "transformers>=4.30.0", - "descript-audio-codec", + "descript-audiotools>=0.7.2", "huggingface_hub[cli]", "easydict", "torch", 
diff --git a/models/codec/dualcodec/setup.py b/models/codec/dualcodec/setup.py
new file mode 100644
index 00000000..976d159e
--- /dev/null
+++ b/models/codec/dualcodec/setup.py
@@ -0,0 +1,25 @@
+"""Setuptools entry point for the ``dualcodec`` package."""
+
+from setuptools import find_packages, setup
+
+setup(
+    name="dualcodec",
+    # Ship ``dualcodec`` together with its subpackages (e.g.
+    # ``dualcodec.model_codec``); listing only ``"dualcodec"`` leaves them out.
+    # NOTE(review): find_packages() only picks up directories that contain an
+    # ``__init__.py`` -- confirm every subpackage has one.
+    packages=find_packages(include=["dualcodec", "dualcodec.*"]),
+    install_requires=[
+        "transformers>=4.30.0",
+        "descript-audiotools>=0.7.2",
+        "huggingface_hub[cli]",
+        "easydict",
+        "torch",
+        "torchaudio",
+        "hydra-core",
+        "einops",
+        "safetensors",
+        "cached_path",
+    ],
+    python_requires=">=3.9",
+)