I made a small change to the author's TTS call method so that it splits the input text into sentences on Chinese sentence-ending punctuation (。!?) and then feeds the sentences to the TTS one at a time.
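To show what the splitting step produces in isolation, here is a minimal standalone sketch (the helper name split_cn_sentences and the sample text are mine, not part of the change itself); the full modified class follows it.

import re

def split_cn_sentences(text):
    # Split on Chinese sentence-ending punctuation while keeping the delimiters,
    # then glue each delimiter back onto the sentence that precedes it.
    parts = re.split(r'([。!?])', text)
    combined = []
    for i in range(0, len(parts), 2):
        s = parts[i].strip()
        if not s:
            continue
        if i + 1 < len(parts):
            s += parts[i + 1]
        combined.append(s)
    return combined

print(split_cn_sentences("你好。今天天气怎么样?我们一起出去玩吧!"))
# ['你好。', '今天天气怎么样?', '我们一起出去玩吧!']

The full modified class: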
import re
import time
from typing import Iterator

import numpy as np
import requests
import resampy

# BaseTTS, State, and logger are provided by the surrounding project module.


class FishTTS(BaseTTS):
    def txt_to_audio(self, msg):
        text, textevent = msg
        # Split into sentences on Chinese sentence-ending punctuation, keeping the delimiters.
        sentences = re.split(r'([。!?])', text)
        # Re-attach each delimiter to the sentence before it to rebuild complete sentences.
        combined = []
        for i in range(0, len(sentences), 2):
            s = sentences[i].strip()
            if not s:
                continue
            if i + 1 < len(sentences):
                s += sentences[i + 1]
            combined.append(s)
        # Call fish_speech sentence by sentence and stream each result out.
        for sent in combined:
            audio_stream = self.fish_speech(
                sent,
                self.opt.REF_FILE,
                self.opt.REF_TEXT,
                "zh",
                self.opt.TTS_SERVER,
            )
            self.stream_tts(audio_stream, (sent, textevent))

    def fish_speech(self, text, reffile, reftext, language, server_url) -> Iterator[bytes]:
        # reftext and language are accepted for interface parity but not used in the request.
        start = time.perf_counter()
        req = {
            'text': text,
            'reference_id': reffile,
            'format': 'wav',
            'streaming': True,
            'use_memory_cache': 'on',
        }
        try:
            res = requests.post(
                f"{server_url}/v1/tts",
                json=req,
                stream=True,
                headers={
                    "content-type": "application/json",
                },
            )
            end = time.perf_counter()
            logger.info(f"fish_speech Time to make POST: {end - start}s")

            if res.status_code != 200:
                logger.error("Error:%s", res.text)
                return

            first = True
            # 17640 bytes = 10 * 1764, where 1764 = 44100 Hz * 20 ms * 2 bytes (one 20 ms frame).
            for chunk in res.iter_content(chunk_size=17640):
                if first:
                    end = time.perf_counter()
                    logger.info(f"fish_speech Time to first chunk: {end - start}s")
                    first = False
                if chunk and self.state == State.RUNNING:
                    yield chunk
        except Exception:
            logger.exception('fishtts')

    def stream_tts(self, audio_stream, msg):
        text, textevent = msg
        first = True
        for chunk in audio_stream:
            if chunk is not None and len(chunk) > 0:
                # Convert 16-bit PCM bytes to float32 in [-1, 1] and resample from 44.1 kHz.
                stream = np.frombuffer(chunk, dtype=np.int16).astype(np.float32) / 32767
                stream = resampy.resample(x=stream, sr_orig=44100, sr_new=self.sample_rate)
                streamlen = stream.shape[0]
                idx = 0
                # Push fixed-size frames downstream; mark the first frame of the sentence.
                while streamlen >= self.chunk:
                    eventpoint = None
                    if first:
                        eventpoint = {'status': 'start', 'text': text, 'msgevent': textevent}
                        first = False
                    self.parent.put_audio_frame(stream[idx:idx + self.chunk], eventpoint)
                    streamlen -= self.chunk
                    idx += self.chunk
        # Signal the end of this sentence with one frame of silence.
        eventpoint = {'status': 'end', 'text': text, 'msgevent': textevent}
        self.parent.put_audio_frame(np.zeros(self.chunk, np.float32), eventpoint)
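For quick testing of the TTS endpoint outside the class, the same streaming request can be exercised standalone and the raw response bytes captured to a file. This is only a sketch reusing the request fields from fish_speech above; the server URL, reference id, and output path are placeholder values.

import requests

def fetch_tts_stream(text, server_url, reference_id, out_path="out.wav"):
    # Same request body as fish_speech above; streaming=True asks the server
    # to send audio chunks as they are synthesized.
    req = {
        'text': text,
        'reference_id': reference_id,
        'format': 'wav',
        'streaming': True,
        'use_memory_cache': 'on',
    }
    res = requests.post(f"{server_url}/v1/tts", json=req, stream=True,
                        headers={"content-type": "application/json"})
    res.raise_for_status()
    with open(out_path, 'wb') as f:
        for chunk in res.iter_content(chunk_size=17640):
            if chunk:
                f.write(chunk)

# Example with placeholder values:
# fetch_tts_stream("你好。", "http://127.0.0.1:8080", "my_reference_id")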