Skip to content

Commit 8c20a59

Browse files
committed
Remove coqui studio integration from TTS
1 parent 5cd750a commit 8c20a59

File tree

11 files changed

+33
-782
lines changed

11 files changed

+33
-782
lines changed

.github/workflows/api_tests.yml

-53
This file was deleted.

Makefile

-3
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,6 @@ test_zoo: ## run zoo tests.
3535
inference_tests: ## run inference tests.
3636
nose2 -F -v -B --with-coverage --coverage TTS tests.inference_tests
3737

38-
api_tests: ## run api tests.
39-
nose2 -F -v -B --with-coverage --coverage TTS tests.api_tests
40-
4138
data_tests: ## run data tests.
4239
nose2 -F -v -B --with-coverage --coverage TTS tests.data_tests
4340

README.md

-29
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,6 @@
77
- 📣 [🐶Bark](https://github.com/suno-ai/bark) is now available for inference with unconstrained voice cloning. [Docs](https://tts.readthedocs.io/en/dev/models/bark.html)
88
- 📣 You can use [~1100 Fairseq models](https://github.com/facebookresearch/fairseq/tree/main/examples/mms) with 🐸TTS.
99
- 📣 🐸TTS now supports 🐢Tortoise with faster inference. [Docs](https://tts.readthedocs.io/en/dev/models/tortoise.html)
10-
- 📣 **Coqui Studio API** is landed on 🐸TTS. - [Example](https://github.com/coqui-ai/TTS/blob/dev/README.md#-python-api)
11-
- 📣 [**Coqui Studio API**](https://docs.coqui.ai/docs) is live.
1210
- 📣 Voice generation with prompts - **Prompt to Voice** - is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin)!! - [Blog Post](https://coqui.ai/blog/tts/prompt-to-voice)
1311
- 📣 Voice generation with fusion - **Voice fusion** - is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin).
1412
- 📣 Voice cloning is live on [**Coqui Studio**](https://app.coqui.ai/auth/signin).
@@ -253,29 +251,6 @@ tts.tts_with_vc_to_file(
253251
)
254252
```
255253

256-
#### Example using [🐸Coqui Studio](https://coqui.ai) voices.
257-
You access all of your cloned voices and built-in speakers in [🐸Coqui Studio](https://coqui.ai).
258-
To do this, you'll need an API token, which you can obtain from the [account page](https://coqui.ai/account).
259-
After obtaining the API token, you'll need to configure the COQUI_STUDIO_TOKEN environment variable.
260-
261-
Once you have a valid API token in place, the studio speakers will be displayed as distinct models within the list.
262-
These models will follow the naming convention `coqui_studio/en/<studio_speaker_name>/coqui_studio`
263-
264-
```python
265-
# XTTS model
266-
models = TTS(cs_api_model="XTTS").list_models()
267-
# Init TTS with the target studio speaker
268-
tts = TTS(model_name="coqui_studio/en/Torcull Diarmuid/coqui_studio", progress_bar=False)
269-
# Run TTS
270-
tts.tts_to_file(text="This is a test.", language="en", file_path=OUTPUT_PATH)
271-
272-
# V1 model
273-
models = TTS(cs_api_model="V1").list_models()
274-
# Run TTS with emotion and speed control
275-
# Emotion control only works with V1 model
276-
tts.tts_to_file(text="This is a test.", file_path=OUTPUT_PATH, emotion="Happy", speed=1.5)
277-
```
278-
279254
#### Example text to speech using **Fairseq models in ~1100 languages** 🤯.
280255
For Fairseq models, use the following name format: `tts_models/<lang-iso_code>/fairseq/vits`.
281256
You can find the language ISO codes [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html)
@@ -353,10 +328,6 @@ If you don't specify any models, then it uses LJSpeech based English model.
353328
354329
- Run TTS and define speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0:
355330
356-
```
357-
$ tts --text "Text for TTS" --model_name "coqui_studio/<language>/<dataset>/<model_name>" --speed 1.2 --out_path output/path/speech.wav
358-
```
359-
360331
- Run a TTS model with its default vocoder model:
361332
362333
```

TTS/api.py

+30-128
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import numpy as np
77
from torch import nn
88

9-
from TTS.cs_api import CS_API
109
from TTS.utils.audio.numpy_transforms import save_wav
1110
from TTS.utils.manage import ModelManager
1211
from TTS.utils.synthesizer import Synthesizer
@@ -24,7 +23,6 @@ def __init__(
2423
vocoder_path: str = None,
2524
vocoder_config_path: str = None,
2625
progress_bar: bool = True,
27-
cs_api_model: str = "XTTS",
2826
gpu=False,
2927
):
3028
"""🐸TTS python interface that allows to load and use the released models.
@@ -60,24 +58,19 @@ def __init__(
6058
vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
6159
vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
6260
progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
63-
cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
64-
"XTTS", "V1". You can also use `TTS.cs_api.CS_API" for more control.
65-
Defaults to "XTTS".
6661
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
6762
"""
6863
super().__init__()
6964
self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar, verbose=False)
7065
self.config = load_config(config_path) if config_path else None
7166
self.synthesizer = None
7267
self.voice_converter = None
73-
self.csapi = None
74-
self.cs_api_model = cs_api_model
7568
self.model_name = ""
7669
if gpu:
7770
warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")
7871

7972
if model_name is not None and len(model_name) > 0:
80-
if "tts_models" in model_name or "coqui_studio" in model_name:
73+
if "tts_models" in model_name:
8174
self.load_tts_model_by_name(model_name, gpu)
8275
elif "voice_conversion_models" in model_name:
8376
self.load_vc_model_by_name(model_name, gpu)
@@ -99,12 +92,6 @@ def is_multi_speaker(self):
9992
return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
10093
return False
10194

102-
@property
103-
def is_coqui_studio(self):
104-
if self.model_name is None:
105-
return False
106-
return "coqui_studio" in self.model_name
107-
10895
@property
10996
def is_multi_lingual(self):
11097
# Not sure what sets this to None, but applied a fix to prevent crashing.
@@ -136,14 +123,7 @@ def get_models_file_path():
136123
return Path(__file__).parent / ".models.json"
137124

138125
def list_models(self):
139-
try:
140-
csapi = CS_API(model=self.cs_api_model)
141-
models = csapi.list_speakers_as_tts_models()
142-
except ValueError as e:
143-
print(e)
144-
models = []
145-
manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
146-
return manager.list_tts_models() + models
126+
return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False)
147127

148128
def download_model_by_name(self, model_name: str):
149129
model_path, config_path, model_item = self.manager.download_model(model_name)
@@ -186,30 +166,26 @@ def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
186166
TODO: Add tests
187167
"""
188168
self.synthesizer = None
189-
self.csapi = None
190169
self.model_name = model_name
191170

192-
if "coqui_studio" in model_name:
193-
self.csapi = CS_API()
194-
else:
195-
model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
196-
model_name
197-
)
171+
model_path, config_path, vocoder_path, vocoder_config_path, model_dir = self.download_model_by_name(
172+
model_name
173+
)
198174

199-
# init synthesizer
200-
# None values are fetched from the model
201-
self.synthesizer = Synthesizer(
202-
tts_checkpoint=model_path,
203-
tts_config_path=config_path,
204-
tts_speakers_file=None,
205-
tts_languages_file=None,
206-
vocoder_checkpoint=vocoder_path,
207-
vocoder_config=vocoder_config_path,
208-
encoder_checkpoint=None,
209-
encoder_config=None,
210-
model_dir=model_dir,
211-
use_cuda=gpu,
212-
)
175+
# init synthesizer
176+
# None values are fetched from the model
177+
self.synthesizer = Synthesizer(
178+
tts_checkpoint=model_path,
179+
tts_config_path=config_path,
180+
tts_speakers_file=None,
181+
tts_languages_file=None,
182+
vocoder_checkpoint=vocoder_path,
183+
vocoder_config=vocoder_config_path,
184+
encoder_checkpoint=None,
185+
encoder_config=None,
186+
model_dir=model_dir,
187+
use_cuda=gpu,
188+
)
213189

214190
def load_tts_model_by_path(
215191
self, model_path: str, config_path: str, vocoder_path: str = None, vocoder_config: str = None, gpu: bool = False
@@ -246,77 +222,17 @@ def _check_arguments(
246222
**kwargs,
247223
) -> None:
248224
"""Check if the arguments are valid for the model."""
249-
if not self.is_coqui_studio:
250-
# check for the coqui tts models
251-
if self.is_multi_speaker and (speaker is None and speaker_wav is None):
252-
raise ValueError("Model is multi-speaker but no `speaker` is provided.")
253-
if self.is_multi_lingual and language is None:
254-
raise ValueError("Model is multi-lingual but no `language` is provided.")
255-
if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
256-
raise ValueError("Model is not multi-speaker but `speaker` is provided.")
257-
if not self.is_multi_lingual and language is not None:
258-
raise ValueError("Model is not multi-lingual but `language` is provided.")
259-
if not emotion is None and not speed is None:
260-
raise ValueError("Emotion and speed can only be used with Coqui Studio models.")
261-
else:
262-
if emotion is None:
263-
emotion = "Neutral"
264-
if speed is None:
265-
speed = 1.0
266-
# check for the studio models
267-
if speaker_wav is not None:
268-
raise ValueError("Coqui Studio models do not support `speaker_wav` argument.")
269-
if speaker is not None:
270-
raise ValueError("Coqui Studio models do not support `speaker` argument.")
271-
if language is not None and language != "en":
272-
raise ValueError("Coqui Studio models currently support only `language=en` argument.")
273-
if emotion not in ["Neutral", "Happy", "Sad", "Angry", "Dull"]:
274-
raise ValueError(f"Emotion - `{emotion}` - must be one of `Neutral`, `Happy`, `Sad`, `Angry`, `Dull`.")
275-
276-
def tts_coqui_studio(
277-
self,
278-
text: str,
279-
speaker_name: str = None,
280-
language: str = None,
281-
emotion: str = None,
282-
speed: float = 1.0,
283-
pipe_out=None,
284-
file_path: str = None,
285-
) -> Union[np.ndarray, str]:
286-
"""Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.
287-
288-
Args:
289-
text (str):
290-
Input text to synthesize.
291-
speaker_name (str, optional):
292-
Speaker name from Coqui Studio. Defaults to None.
293-
language (str): Language of the text. If None, the default language of the speaker is used. Language is only
294-
supported by `XTTS` model.
295-
emotion (str, optional):
296-
Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available
297-
with "V1" model. Defaults to None.
298-
speed (float, optional):
299-
Speed of the speech. Defaults to 1.0.
300-
pipe_out (BytesIO, optional):
301-
Flag to stdout the generated TTS wav file for shell pipe.
302-
file_path (str, optional):
303-
Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.
304-
305-
Returns:
306-
Union[np.ndarray, str]: Waveform of the synthesized speech or path to the output file.
307-
"""
308-
speaker_name = self.model_name.split("/")[2]
309-
if file_path is not None:
310-
return self.csapi.tts_to_file(
311-
text=text,
312-
speaker_name=speaker_name,
313-
language=language,
314-
speed=speed,
315-
pipe_out=pipe_out,
316-
emotion=emotion,
317-
file_path=file_path,
318-
)[0]
319-
return self.csapi.tts(text=text, speaker_name=speaker_name, language=language, speed=speed, emotion=emotion)[0]
225+
# check for the coqui tts models
226+
if self.is_multi_speaker and (speaker is None and speaker_wav is None):
227+
raise ValueError("Model is multi-speaker but no `speaker` is provided.")
228+
if self.is_multi_lingual and language is None:
229+
raise ValueError("Model is multi-lingual but no `language` is provided.")
230+
if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
231+
raise ValueError("Model is not multi-speaker but `speaker` is provided.")
232+
if not self.is_multi_lingual and language is not None:
233+
raise ValueError("Model is not multi-lingual but `language` is provided.")
234+
if not emotion is None and not speed is None:
235+
raise ValueError("Emotion and speed can only be used with Coqui Studio models, which are discontinued.")
320236

321237
def tts(
322238
self,
@@ -357,10 +273,6 @@ def tts(
357273
self._check_arguments(
358274
speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
359275
)
360-
if self.csapi is not None:
361-
return self.tts_coqui_studio(
362-
text=text, speaker_name=speaker, language=language, emotion=emotion, speed=speed
363-
)
364276
wav = self.synthesizer.tts(
365277
text=text,
366278
speaker_name=speaker,
@@ -419,16 +331,6 @@ def tts_to_file(
419331
"""
420332
self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
421333

422-
if self.csapi is not None:
423-
return self.tts_coqui_studio(
424-
text=text,
425-
speaker_name=speaker,
426-
language=language,
427-
emotion=emotion,
428-
speed=speed,
429-
file_path=file_path,
430-
pipe_out=pipe_out,
431-
)
432334
wav = self.tts(
433335
text=text,
434336
speaker=speaker,

0 commit comments

Comments
 (0)