@@ -101,7 +101,7 @@ def load_discrete_vocoder_diffuser(trained_diffusion_steps=4000, desired_diffusi
                            conditioning_free=cond_free, conditioning_free_k=cond_free_k)
 
 
-def format_conditioning(clip, cond_length=132300):
+def format_conditioning(clip, cond_length=132300, device='cuda'):
     """
     Converts the given conditioning signal to a MEL spectrogram and clips it as expected by the models.
     """
@@ -112,7 +112,7 @@ def format_conditioning(clip, cond_length=132300):
         rand_start = random.randint(0, gap)
         clip = clip[:, rand_start:rand_start + cond_length]
     mel_clip = TorchMelSpectrogram()(clip.unsqueeze(0)).squeeze(0)
-    return mel_clip.unsqueeze(0).cuda()
+    return mel_clip.unsqueeze(0).to(device)
 
 
 def fix_autoregressive_output(codes, stop_token, complain=True):
@@ -181,14 +181,15 @@ def pick_best_batch_size_for_gpu():
     Tries to pick a batch size that will fit in your GPU. These sizes aren't guaranteed to work, but they should give
     you a good shot.
     """
-    free, available = torch.cuda.mem_get_info()
-    availableGb = available / (1024 ** 3)
-    if availableGb > 14:
-        return 16
-    elif availableGb > 10:
-        return 8
-    elif availableGb > 7:
-        return 4
+    if torch.cuda.is_available():
+        _, available = torch.cuda.mem_get_info()
+        availableGb = available / (1024 ** 3)
+        if availableGb > 14:
+            return 16
+        elif availableGb > 10:
+            return 8
+        elif availableGb > 7:
+            return 4
     return 1
 
 
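With the `torch.cuda.is_available()` guard above, CPU-only machines fall through to the batch size of 1 instead of crashing inside `torch.cuda.mem_get_info()`. A minimal sketch of the resulting behavior (assuming the function is imported from this module, `tortoise.api`):

    from tortoise.api import pick_best_batch_size_for_gpu

    # On a CUDA machine this returns 16, 8, or 4 depending on free VRAM;
    # without CUDA it now returns the conservative fallback of 1.
    print(pick_best_batch_size_for_gpu())
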
@@ -197,7 +198,7 @@ class TextToSpeech:
     Main entry point into Tortoise.
     """
 
-    def __init__(self, autoregressive_batch_size=None, models_dir=MODELS_DIR, enable_redaction=True):
+    def __init__(self, autoregressive_batch_size=None, models_dir=MODELS_DIR, enable_redaction=True, device=None):
         """
         Constructor
         :param autoregressive_batch_size: Specifies how many samples to generate per batch. Lower this if you are seeing
@@ -207,10 +208,12 @@ def __init__(self, autoregressive_batch_size=None, models_dir=MODELS_DIR, enable
         :param enable_redaction: When true, text enclosed in brackets are automatically redacted from the spoken output
                                  (but are still rendered by the model). This can be used for prompt engineering.
                                  Default is true.
+        :param device: Device to use when running the model. If omitted, the device will be automatically chosen.
         """
         self.models_dir = models_dir
         self.autoregressive_batch_size = pick_best_batch_size_for_gpu() if autoregressive_batch_size is None else autoregressive_batch_size
         self.enable_redaction = enable_redaction
+        self.device = torch.device(device) if device is not None else torch.device('cuda' if torch.cuda.is_available() else 'cpu')
         if self.enable_redaction:
             self.aligner = Wav2VecAlignment()
 
@@ -240,7 +243,7 @@ def __init__(self, autoregressive_batch_size=None, models_dir=MODELS_DIR, enable
         self.cvvp = None  # CVVP model is only loaded if used.
 
         self.vocoder = UnivNetGenerator().cpu()
-        self.vocoder.load_state_dict(torch.load(get_model_path('vocoder.pth', models_dir))['model_g'])
+        self.vocoder.load_state_dict(torch.load(get_model_path('vocoder.pth', models_dir), map_location=torch.device('cpu'))['model_g'])
         self.vocoder.eval(inference=True)
 
         # Random latent generators (RLGs) are loaded lazily.
@@ -261,15 +264,15 @@ def get_conditioning_latents(self, voice_samples, return_mels=False):
         :param voice_samples: List of 2 or more ~10 second reference clips, which should be torch tensors containing 22.05kHz waveform data.
         """
         with torch.no_grad():
-            voice_samples = [v.to('cuda') for v in voice_samples]
+            voice_samples = [v.to(self.device) for v in voice_samples]
 
             auto_conds = []
             if not isinstance(voice_samples, list):
                 voice_samples = [voice_samples]
             for vs in voice_samples:
-                auto_conds.append(format_conditioning(vs))
+                auto_conds.append(format_conditioning(vs, device=self.device))
             auto_conds = torch.stack(auto_conds, dim=1)
-            self.autoregressive = self.autoregressive.cuda()
+            self.autoregressive = self.autoregressive.to(self.device)
             auto_latent = self.autoregressive.get_conditioning(auto_conds)
             self.autoregressive = self.autoregressive.cpu()
 
@@ -278,11 +281,11 @@ def get_conditioning_latents(self, voice_samples, return_mels=False):
                 # The diffuser operates at a sample rate of 24000 (except for the latent inputs)
                 sample = torchaudio.functional.resample(sample, 22050, 24000)
                 sample = pad_or_truncate(sample, 102400)
-                cond_mel = wav_to_univnet_mel(sample.to('cuda'), do_normalization=False)
+                cond_mel = wav_to_univnet_mel(sample.to(self.device), do_normalization=False, device=self.device)
                 diffusion_conds.append(cond_mel)
             diffusion_conds = torch.stack(diffusion_conds, dim=1)
 
-            self.diffusion = self.diffusion.cuda()
+            self.diffusion = self.diffusion.to(self.device)
             diffusion_latent = self.diffusion.get_conditioning(diffusion_conds)
             self.diffusion = self.diffusion.cpu()
 
@@ -380,7 +383,7 @@ def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=
         """
         deterministic_seed = self.deterministic_state(seed=use_deterministic_seed)
 
-        text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).cuda()
+        text_tokens = torch.IntTensor(self.tokenizer.encode(text)).unsqueeze(0).to(self.device)
         text_tokens = F.pad(text_tokens, (0, 1))  # This may not be necessary.
         assert text_tokens.shape[-1] < 400, 'Too much text provided. Break the text up into separate segments and re-try inference.'
 
@@ -391,8 +394,8 @@ def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=
             auto_conditioning, diffusion_conditioning = conditioning_latents
         else:
             auto_conditioning, diffusion_conditioning = self.get_random_conditioning_latents()
-        auto_conditioning = auto_conditioning.cuda()
-        diffusion_conditioning = diffusion_conditioning.cuda()
+        auto_conditioning = auto_conditioning.to(self.device)
+        diffusion_conditioning = diffusion_conditioning.to(self.device)
 
         diffuser = load_discrete_vocoder_diffuser(desired_diffusion_steps=diffusion_iterations, cond_free=cond_free, cond_free_k=cond_free_k)
 
@@ -401,7 +404,7 @@ def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=
             num_batches = num_autoregressive_samples // self.autoregressive_batch_size
             stop_mel_token = self.autoregressive.stop_mel_token
             calm_token = 83  # This is the token for coding silence, which is fixed in place with "fix_autoregressive_output"
-            self.autoregressive = self.autoregressive.cuda()
+            self.autoregressive = self.autoregressive.to(self.device)
             if verbose:
                 print("Generating autoregressive samples..")
             for b in tqdm(range(num_batches), disable=not verbose):
@@ -420,11 +423,11 @@ def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=
             self.autoregressive = self.autoregressive.cpu()
 
             clip_results = []
-            self.clvp = self.clvp.cuda()
+            self.clvp = self.clvp.to(self.device)
             if cvvp_amount > 0:
                 if self.cvvp is None:
                     self.load_cvvp()
-                self.cvvp = self.cvvp.cuda()
+                self.cvvp = self.cvvp.to(self.device)
             if verbose:
                 if self.cvvp is None:
                     print("Computing best candidates using CLVP")
@@ -457,7 +460,7 @@ def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=
             # The diffusion model actually wants the last hidden layer from the autoregressive model as conditioning
             # inputs. Re-produce those for the top results. This could be made more efficient by storing all of these
             # results, but will increase memory usage.
-            self.autoregressive = self.autoregressive.cuda()
+            self.autoregressive = self.autoregressive.to(self.device)
             best_latents = self.autoregressive(auto_conditioning.repeat(k, 1), text_tokens.repeat(k, 1),
                                                torch.tensor([text_tokens.shape[-1]], device=text_tokens.device), best_results,
                                                torch.tensor([best_results.shape[-1]*self.autoregressive.mel_length_compression], device=text_tokens.device),
@@ -468,8 +471,8 @@ def tts(self, text, voice_samples=None, conditioning_latents=None, k=1, verbose=
             if verbose:
                 print("Transforming autoregressive outputs into audio..")
             wav_candidates = []
-            self.diffusion = self.diffusion.cuda()
-            self.vocoder = self.vocoder.cuda()
+            self.diffusion = self.diffusion.to(self.device)
+            self.vocoder = self.vocoder.to(self.device)
             for b in range(best_results.shape[0]):
                 codes = best_results[b].unsqueeze(0)
                 latents = best_latents[b].unsqueeze(0)
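Taken together, these changes make the pipeline runnable without a GPU: the vocoder checkpoint is loaded with a CPU `map_location`, and every hard-coded `.cuda()` call becomes `.to(self.device)`. A minimal usage sketch under these assumptions (the reference clip paths are hypothetical; `TextToSpeech`, `tts_with_preset`, and `load_audio` are the upstream Tortoise entry points):

    import torchaudio
    from tortoise.api import TextToSpeech
    from tortoise.utils.audio import load_audio

    # device=None auto-selects CUDA when available, otherwise CPU;
    # pass device='cpu' to force CPU inference explicitly.
    tts = TextToSpeech(device='cpu')

    clips = [load_audio(p, 22050) for p in ['ref1.wav', 'ref2.wav']]  # hypothetical paths
    wav = tts.tts_with_preset('Hello from Tortoise.', voice_samples=clips, preset='fast')
    torchaudio.save('out.wav', wav.squeeze(0).cpu(), 24000)  # diffuser output is 24kHz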