Skip to content

Commit

Permalink
Merge pull request #379 from boji123/bj_dev_stream_fix
Browse files (browse the repository at this point in the history)
[debug] fix badcase, add fade on speech output
  • Loading branch information
aluminumbox authored Sep 19, 2024
2 parents f6b5c42 + 9e0b99e commit cd26f11
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 3 deletions.
10 changes: 9 additions & 1 deletion cosyvoice/cli/model.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -50,6 +50,7 @@ def __init__(self,
self.llm_end_dict = {}
self.mel_overlap_dict = {}
self.hift_cache_dict = {}
self.speech_window = np.hamming(2 * self.source_cache_len)

def load(self, llm_model, flow_model, hift_model):
self.llm.load_state_dict(torch.load(llm_model, map_location=self.device))
Expand Down Expand Up @@ -114,13 +115,20 @@ def token2wav(self, token, prompt_token, prompt_feat, embedding, uuid, finalize=
self.mel_overlap_dict[uuid] = tts_mel[:, :, -self.mel_overlap_len:]
tts_mel = tts_mel[:, :, :-self.mel_overlap_len]
tts_speech, tts_source = self.hift.inference(mel=tts_mel, cache_source=hift_cache_source)
self.hift_cache_dict[uuid] = {'source': tts_source[:, :, -self.source_cache_len:], 'mel': tts_mel[:, :, -self.mel_cache_len:]}
if self.hift_cache_dict[uuid] is not None:
tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
self.hift_cache_dict[uuid] = {
'mel': tts_mel[:, :, -self.mel_cache_len:],
'source': tts_source[:, :, -self.source_cache_len:],
'speech': tts_speech[:, -self.source_cache_len:]}
tts_speech = tts_speech[:, :-self.source_cache_len]
else:
if speed != 1.0:
assert self.hift_cache_dict[uuid] is None, 'speed change only support non-stream inference mode'
tts_mel = F.interpolate(tts_mel, size=int(tts_mel.shape[2] / speed), mode='linear')
tts_speech, tts_source = self.hift.inference(mel=tts_mel, cache_source=hift_cache_source)
if self.hift_cache_dict[uuid] is not None:
tts_speech = fade_in_out(tts_speech, self.hift_cache_dict[uuid]['speech'], self.speech_window)
return tts_speech

def inference(self, text, flow_embedding, llm_embedding=torch.zeros(0, 192),
Expand Down
5 changes: 3 additions & 2 deletions cosyvoice/utils/common.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -139,6 +139,7 @@ def fade_in_out(fade_in_mel, fade_out_mel, window):
device = fade_in_mel.device
fade_in_mel, fade_out_mel = fade_in_mel.cpu(), fade_out_mel.cpu()
mel_overlap_len = int(window.shape[0] / 2)
fade_in_mel[:, :, :mel_overlap_len] = fade_in_mel[:, :, :mel_overlap_len] * window[:mel_overlap_len] + \
fade_out_mel[:, :, -mel_overlap_len:] * window[mel_overlap_len:]

fade_in_mel[..., :mel_overlap_len] = fade_in_mel[..., :mel_overlap_len] * window[:mel_overlap_len] + \
fade_out_mel[..., -mel_overlap_len:] * window[mel_overlap_len:]
return fade_in_mel.to(device)

0 comments on commit cd26f11

Please sign in to comment.