6
6
import numpy as np
7
7
from torch import nn
8
8
9
- from TTS .cs_api import CS_API
10
9
from TTS .utils .audio .numpy_transforms import save_wav
11
10
from TTS .utils .manage import ModelManager
12
11
from TTS .utils .synthesizer import Synthesizer
@@ -24,7 +23,6 @@ def __init__(
24
23
vocoder_path : str = None ,
25
24
vocoder_config_path : str = None ,
26
25
progress_bar : bool = True ,
27
- cs_api_model : str = "XTTS" ,
28
26
gpu = False ,
29
27
):
30
28
"""🐸TTS python interface that allows to load and use the released models.
@@ -60,24 +58,19 @@ def __init__(
60
58
vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
61
59
vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
62
60
progress_bar (bool, optional): Whether to pring a progress bar while downloading a model. Defaults to True.
63
- cs_api_model (str, optional): Name of the model to use for the Coqui Studio API. Available models are
64
- "XTTS", "V1". You can also use `TTS.cs_api.CS_API" for more control.
65
- Defaults to "XTTS".
66
61
gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
67
62
"""
68
63
super ().__init__ ()
69
64
self .manager = ModelManager (models_file = self .get_models_file_path (), progress_bar = progress_bar , verbose = False )
70
65
self .config = load_config (config_path ) if config_path else None
71
66
self .synthesizer = None
72
67
self .voice_converter = None
73
- self .csapi = None
74
- self .cs_api_model = cs_api_model
75
68
self .model_name = ""
76
69
if gpu :
77
70
warnings .warn ("`gpu` will be deprecated. Please use `tts.to(device)` instead." )
78
71
79
72
if model_name is not None and len (model_name ) > 0 :
80
- if "tts_models" in model_name or "coqui_studio" in model_name :
73
+ if "tts_models" in model_name :
81
74
self .load_tts_model_by_name (model_name , gpu )
82
75
elif "voice_conversion_models" in model_name :
83
76
self .load_vc_model_by_name (model_name , gpu )
@@ -99,12 +92,6 @@ def is_multi_speaker(self):
99
92
return self .synthesizer .tts_model .speaker_manager .num_speakers > 1
100
93
return False
101
94
102
- @property
103
- def is_coqui_studio (self ):
104
- if self .model_name is None :
105
- return False
106
- return "coqui_studio" in self .model_name
107
-
108
95
@property
109
96
def is_multi_lingual (self ):
110
97
# Not sure what sets this to None, but applied a fix to prevent crashing.
@@ -136,14 +123,7 @@ def get_models_file_path():
136
123
return Path (__file__ ).parent / ".models.json"
137
124
138
125
def list_models(self):
    """Return the names of all released 🐸TTS text-to-speech models.

    Returns:
        list: Model names in the ``model_type/language/dataset/model`` format,
        as produced by ``ModelManager.list_tts_models()``.
    """
    # NOTE(review): after the Coqui Studio removal this method returned the
    # ModelManager *instance* instead of the model-name list, silently breaking
    # callers that iterate/concatenate the result (the pre-change code returned
    # `manager.list_tts_models() + studio_models`). Restore the list return by
    # actually querying the manager.
    manager = ModelManager(
        models_file=TTS.get_models_file_path(), progress_bar=False, verbose=False
    )
    return manager.list_tts_models()
147
127
148
128
def download_model_by_name (self , model_name : str ):
149
129
model_path , config_path , model_item = self .manager .download_model (model_name )
@@ -186,30 +166,26 @@ def load_tts_model_by_name(self, model_name: str, gpu: bool = False):
186
166
TODO: Add tests
187
167
"""
188
168
self .synthesizer = None
189
- self .csapi = None
190
169
self .model_name = model_name
191
170
192
- if "coqui_studio" in model_name :
193
- self .csapi = CS_API ()
194
- else :
195
- model_path , config_path , vocoder_path , vocoder_config_path , model_dir = self .download_model_by_name (
196
- model_name
197
- )
171
+ model_path , config_path , vocoder_path , vocoder_config_path , model_dir = self .download_model_by_name (
172
+ model_name
173
+ )
198
174
199
- # init synthesizer
200
- # None values are fetch from the model
201
- self .synthesizer = Synthesizer (
202
- tts_checkpoint = model_path ,
203
- tts_config_path = config_path ,
204
- tts_speakers_file = None ,
205
- tts_languages_file = None ,
206
- vocoder_checkpoint = vocoder_path ,
207
- vocoder_config = vocoder_config_path ,
208
- encoder_checkpoint = None ,
209
- encoder_config = None ,
210
- model_dir = model_dir ,
211
- use_cuda = gpu ,
212
- )
175
+ # init synthesizer
176
+ # None values are fetch from the model
177
+ self .synthesizer = Synthesizer (
178
+ tts_checkpoint = model_path ,
179
+ tts_config_path = config_path ,
180
+ tts_speakers_file = None ,
181
+ tts_languages_file = None ,
182
+ vocoder_checkpoint = vocoder_path ,
183
+ vocoder_config = vocoder_config_path ,
184
+ encoder_checkpoint = None ,
185
+ encoder_config = None ,
186
+ model_dir = model_dir ,
187
+ use_cuda = gpu ,
188
+ )
213
189
214
190
def load_tts_model_by_path (
215
191
self , model_path : str , config_path : str , vocoder_path : str = None , vocoder_config : str = None , gpu : bool = False
@@ -246,77 +222,17 @@ def _check_arguments(
246
222
** kwargs ,
247
223
) -> None :
248
224
"""Check if the arguments are valid for the model."""
249
- if not self .is_coqui_studio :
250
- # check for the coqui tts models
251
- if self .is_multi_speaker and (speaker is None and speaker_wav is None ):
252
- raise ValueError ("Model is multi-speaker but no `speaker` is provided." )
253
- if self .is_multi_lingual and language is None :
254
- raise ValueError ("Model is multi-lingual but no `language` is provided." )
255
- if not self .is_multi_speaker and speaker is not None and "voice_dir" not in kwargs :
256
- raise ValueError ("Model is not multi-speaker but `speaker` is provided." )
257
- if not self .is_multi_lingual and language is not None :
258
- raise ValueError ("Model is not multi-lingual but `language` is provided." )
259
- if not emotion is None and not speed is None :
260
- raise ValueError ("Emotion and speed can only be used with Coqui Studio models." )
261
- else :
262
- if emotion is None :
263
- emotion = "Neutral"
264
- if speed is None :
265
- speed = 1.0
266
- # check for the studio models
267
- if speaker_wav is not None :
268
- raise ValueError ("Coqui Studio models do not support `speaker_wav` argument." )
269
- if speaker is not None :
270
- raise ValueError ("Coqui Studio models do not support `speaker` argument." )
271
- if language is not None and language != "en" :
272
- raise ValueError ("Coqui Studio models currently support only `language=en` argument." )
273
- if emotion not in ["Neutral" , "Happy" , "Sad" , "Angry" , "Dull" ]:
274
- raise ValueError (f"Emotion - `{ emotion } ` - must be one of `Neutral`, `Happy`, `Sad`, `Angry`, `Dull`." )
275
-
276
- def tts_coqui_studio (
277
- self ,
278
- text : str ,
279
- speaker_name : str = None ,
280
- language : str = None ,
281
- emotion : str = None ,
282
- speed : float = 1.0 ,
283
- pipe_out = None ,
284
- file_path : str = None ,
285
- ) -> Union [np .ndarray , str ]:
286
- """Convert text to speech using Coqui Studio models. Use `CS_API` class if you are only interested in the API.
287
-
288
- Args:
289
- text (str):
290
- Input text to synthesize.
291
- speaker_name (str, optional):
292
- Speaker name from Coqui Studio. Defaults to None.
293
- language (str): Language of the text. If None, the default language of the speaker is used. Language is only
294
- supported by `XTTS` model.
295
- emotion (str, optional):
296
- Emotion of the speaker. One of "Neutral", "Happy", "Sad", "Angry", "Dull". Emotions are only available
297
- with "V1" model. Defaults to None.
298
- speed (float, optional):
299
- Speed of the speech. Defaults to 1.0.
300
- pipe_out (BytesIO, optional):
301
- Flag to stdout the generated TTS wav file for shell pipe.
302
- file_path (str, optional):
303
- Path to save the output file. When None it returns the `np.ndarray` of waveform. Defaults to None.
304
-
305
- Returns:
306
- Union[np.ndarray, str]: Waveform of the synthesized speech or path to the output file.
307
- """
308
- speaker_name = self .model_name .split ("/" )[2 ]
309
- if file_path is not None :
310
- return self .csapi .tts_to_file (
311
- text = text ,
312
- speaker_name = speaker_name ,
313
- language = language ,
314
- speed = speed ,
315
- pipe_out = pipe_out ,
316
- emotion = emotion ,
317
- file_path = file_path ,
318
- )[0 ]
319
- return self .csapi .tts (text = text , speaker_name = speaker_name , language = language , speed = speed , emotion = emotion )[0 ]
225
+ # check for the coqui tts models
226
+ if self .is_multi_speaker and (speaker is None and speaker_wav is None ):
227
+ raise ValueError ("Model is multi-speaker but no `speaker` is provided." )
228
+ if self .is_multi_lingual and language is None :
229
+ raise ValueError ("Model is multi-lingual but no `language` is provided." )
230
+ if not self .is_multi_speaker and speaker is not None and "voice_dir" not in kwargs :
231
+ raise ValueError ("Model is not multi-speaker but `speaker` is provided." )
232
+ if not self .is_multi_lingual and language is not None :
233
+ raise ValueError ("Model is not multi-lingual but `language` is provided." )
234
+ if not emotion is None and not speed is None :
235
+ raise ValueError ("Emotion and speed can only be used with Coqui Studio models. Which is discontinued." )
320
236
321
237
def tts (
322
238
self ,
@@ -357,10 +273,6 @@ def tts(
357
273
self ._check_arguments (
358
274
speaker = speaker , language = language , speaker_wav = speaker_wav , emotion = emotion , speed = speed , ** kwargs
359
275
)
360
- if self .csapi is not None :
361
- return self .tts_coqui_studio (
362
- text = text , speaker_name = speaker , language = language , emotion = emotion , speed = speed
363
- )
364
276
wav = self .synthesizer .tts (
365
277
text = text ,
366
278
speaker_name = speaker ,
@@ -419,16 +331,6 @@ def tts_to_file(
419
331
"""
420
332
self ._check_arguments (speaker = speaker , language = language , speaker_wav = speaker_wav , ** kwargs )
421
333
422
- if self .csapi is not None :
423
- return self .tts_coqui_studio (
424
- text = text ,
425
- speaker_name = speaker ,
426
- language = language ,
427
- emotion = emotion ,
428
- speed = speed ,
429
- file_path = file_path ,
430
- pipe_out = pipe_out ,
431
- )
432
334
wav = self .tts (
433
335
text = text ,
434
336
speaker = speaker ,
0 commit comments