-
Notifications
You must be signed in to change notification settings - Fork 20
/
predict.py
46 lines (42 loc) · 1.95 KB
/
predict.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# Prediction interface for Cog
from cog import BasePredictor, Input, Path
import os
from TTS.api import TTS
class Predictor(BasePredictor):
def setup(self) -> None:
"""Load the model into memory to make running multiple predictions efficient"""
os.environ["COQUI_TOS_AGREED"] = "1"
self.model = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to('cuda')
def predict(
self,
text: str = Input(
description="Text to synthesize",
default="Hi there, I'm your new voice clone. Try your best to upload quality audio"
),
speaker: Path = Input(description="Original speaker audio (wav, mp3, m4a, ogg, or flv). Duration should be at least 6 seconds."),
language: str = Input(
description="Output language for the synthesised speech",
choices=["en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar", "zh", "hu", "ko", "hi"],
default="en"
),
cleanup_voice: bool = Input(
description="Whether to apply denoising to the speaker audio (microphone recordings)",
default=False
),
) -> Path:
"""Run a single prediction on the model"""
speaker_wav = "/tmp/speaker.wav"
filter = "highpass=75,lowpass=8000,"
trim_silence = "areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02,areverse,silenceremove=start_periods=1:start_silence=0:start_threshold=0.02"
# ffmpeg convert to wav and apply afftn denoise filter. y to overwrite and avoid caching
if cleanup_voice:
os.system(f"ffmpeg -i {speaker} -af {filter}{trim_silence} -y {speaker_wav}")
else:
os.system(f"ffmpeg -i {speaker} -y {speaker_wav}")
path = self.model.tts_to_file(
text=text,
file_path = "/tmp/output.wav",
speaker_wav = speaker_wav,
language = language
)
return Path(path)