Skip to content

经验分享:fish_speech OpenAudio-S1-mini 推理时,朗读长文本有时会因 tokens 超长而导致文本被截断,以下是我的处理方法 #532

@WangSiyao666

Description

@WangSiyao666

我对作者调用 TTS 的方法做了一点改动:先按中文句末标点(。!?)把文本切分成句子,再逐句传给 TTS 合成。

class FishTTS(BaseTTS):
    """TTS backend that streams speech from a fish_speech HTTP server.

    Incoming text is split into sentences and every sentence is synthesised
    with its own request, so that a single request never carries a very long
    text (long inputs were reported to be truncated by the model's token
    limit).
    """

    # Sentence terminators: ideographic full stop (U+3002), ASCII "!" / "?"
    # and their full-width forms (U+FF01 / U+FF1F). They are written as
    # escapes so the pattern survives tools that normalise full-width
    # punctuation. A run of terminators ("?!") is kept together as one
    # delimiter. ASCII "." is deliberately not included because it would
    # also split decimals and abbreviations.
    _SENTENCE_END = re.compile(r'([\u3002!?\uff01\uff1f]+)')

    # Seconds allowed for establishing the TCP connection to the TTS server.
    # The read side stays unbounded because synthesis time depends on the
    # text length.
    _CONNECT_TIMEOUT = 10

    @classmethod
    def _split_sentences(cls, text):
        """Return the non-empty sentences of ``text``, terminators included."""
        # Splitting on a capturing group yields
        # [body, delimiter, body, delimiter, ..., body].
        parts = cls._SENTENCE_END.split(text or '')
        sentences = []
        for i in range(0, len(parts), 2):
            body = parts[i].strip()
            if not body:
                continue
            if i + 1 < len(parts):
                body += parts[i + 1]
            sentences.append(body)
        return sentences

    def txt_to_audio(self, msg):
        """Synthesise and play ``msg`` sentence by sentence.

        Args:
            msg: ``(text, textevent)`` tuple. ``textevent`` is forwarded
                unchanged in the start/end event of every sentence.
        """
        text, textevent = msg
        for sent in self._split_sentences(text):
            audio_stream = self.fish_speech(
                sent,
                self.opt.REF_FILE,
                self.opt.REF_TEXT,
                "zh",
                self.opt.TTS_SERVER,
            )
            self.stream_tts(audio_stream, (sent, textevent))

    def fish_speech(self, text, reffile, reftext, language, server_url) -> Iterator[bytes]:
        """Request speech for ``text`` and yield the raw response bytes.

        Args:
            text: Sentence to synthesise.
            reffile: Sent to the server as ``reference_id``.
            reftext: Unused; kept for interface compatibility.
            language: Unused; kept for interface compatibility.
            server_url: Base URL of the server; ``/v1/tts`` is appended.

        Yields:
            Non-empty byte chunks of the streamed response while
            ``self.state`` is ``State.RUNNING``.

        Any exception is logged and ends the stream; nothing is re-raised.
        """
        start = time.perf_counter()
        req = {
            'text': text,
            'reference_id': reffile,
            'format': 'wav',
            'streaming': True,
            'use_memory_cache': 'on'
        }
        try:
            # The context manager releases the connection on every exit path:
            # normal completion, early stop, error, or the consumer closing
            # this generator.
            with requests.post(
                f"{server_url}/v1/tts",
                json=req,
                stream=True,
                headers={
                    "content-type": "application/json",
                },
                timeout=(self._CONNECT_TIMEOUT, None),
            ) as res:
                end = time.perf_counter()
                logger.info(f"fish_speech Time to make POST: {end - start}s")

                if res.status_code != 200:
                    logger.error("Error:%s", res.text)
                    return

                first = True

                # 17640 bytes = 10 * 1764; 1764 bytes is 20 ms of 16-bit
                # samples at 44100 Hz (assuming mono output -- TODO confirm).
                for chunk in res.iter_content(chunk_size=17640):
                    if first:
                        end = time.perf_counter()
                        logger.info(f"fish_speech Time to first chunk: {end - start}s")
                        first = False
                    if self.state != State.RUNNING:
                        # Nothing would be yielded any more; stop downloading
                        # instead of draining the rest of the response.
                        break
                    if chunk:
                        yield chunk
        except Exception:
            logger.exception('fishtts')

    def stream_tts(self, audio_stream, msg):
        """Resample streamed PCM and push it to the parent in fixed frames.

        Args:
            audio_stream: Iterable of byte chunks, interpreted as int16
                samples at 44100 Hz.
            msg: ``(text, textevent)`` tuple used to build the event points.

        The first audio frame carries a ``'start'`` event. A final silent
        frame carrying the ``'end'`` event is always sent, even when no
        audio was received.
        """
        text, textevent = msg
        first = True
        leftover_bytes = b''   # dangling byte of a split int16 sample
        leftover = None        # resampled samples not yet filling a frame

        for chunk in audio_stream:
            if chunk is None or len(chunk) == 0:
                continue

            # np.frombuffer needs a whole number of int16 samples; carry an
            # odd trailing byte over to the next chunk instead of raising.
            data = leftover_bytes + chunk
            usable = len(data) - (len(data) % 2)
            leftover_bytes = data[usable:]
            if usable == 0:
                continue

            stream = np.frombuffer(data[:usable], dtype=np.int16).astype(np.float32) / 32767
            stream = resampy.resample(x=stream, sr_orig=44100, sr_new=self.sample_rate)
            if leftover is not None and leftover.shape[0] > 0:
                # Prepend the tail of the previous chunk so no samples are
                # dropped at chunk boundaries.
                stream = np.concatenate((leftover, stream))

            idx = 0
            total = stream.shape[0]
            while total - idx >= self.chunk:
                eventpoint = None
                if first:
                    eventpoint = {'status': 'start', 'text': text, 'msgevent': textevent}
                    first = False
                self.parent.put_audio_frame(stream[idx:idx + self.chunk], eventpoint)
                idx += self.chunk
            leftover = stream[idx:]

        if leftover is not None and leftover.shape[0] > 0:
            # Flush the last partial frame, zero-padded to the frame size.
            tail = np.zeros(self.chunk, np.float32)
            tail[:leftover.shape[0]] = leftover
            eventpoint = None
            if first:
                eventpoint = {'status': 'start', 'text': text, 'msgevent': textevent}
                first = False
            self.parent.put_audio_frame(tail, eventpoint)

        eventpoint = {'status': 'end', 'text': text, 'msgevent': textevent}
        self.parent.put_audio_frame(np.zeros(self.chunk, np.float32), eventpoint)

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions