diff --git a/zipvoice/modeling_utils.py b/zipvoice/modeling_utils.py index f4e621d..e5b038d 100644 --- a/zipvoice/modeling_utils.py +++ b/zipvoice/modeling_utils.py @@ -47,7 +47,7 @@ class LuxTTSConfig: def process_audio(audio, transcriber, tokenizer, feature_extractor, device, target_rms=0.1, duration=4, feat_scale=0.1): prompt_wav, sr = librosa.load(audio, sr=24000, duration=duration) prompt_wav2, sr = librosa.load(audio, sr=16000, duration=duration) - prompt_text = transcriber(prompt_wav2)["text"] + prompt_text = transcriber(prompt_wav2, return_timestamps=True)["text"] print(prompt_text) prompt_wav = torch.from_numpy(prompt_wav).unsqueeze(0)