Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added a file-to-speech command and restructured the files (新增了文件转语音命令,修改了文件结构) #24

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 150 additions & 92 deletions python_cli_demo/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,104 +8,162 @@
import re
import uuid
import argparse
import os

class MsTTS:

def parseArgs():
    """Parse command-line arguments for the text-to-speech CLI."""
    cli = argparse.ArgumentParser(description='text2speech')
    cli.add_argument('--input', dest='input', help='SSML(语音合成标记语言)的路径', type=str, required=True)
    cli.add_argument('--output', dest='output', help='保存mp3文件的路径', type=str, required=False)
    return cli.parse_args()

# Fix the time to match Americanisms
def hr_cr(hr):
    """Shift the hour back by one, wrapping on a 24-hour clock; return it as a string."""
    return str((hr + 23) % 24)

# Add zeros in the right places i.e 22:1:5 -> 22:01:05
def fr(input_string):
    """Left-pad *input_string* with '0' to at least two characters (longer strings pass through)."""
    return input_string.rjust(2, '0')

# Generate X-Timestamp all correctly formatted
def getXTime():
    """Return the current local time formatted for the X-Timestamp header (hour shifted via hr_cr)."""
    now = datetime.now()
    date_part = fr(str(now.year)) + '-' + fr(str(now.month)) + '-' + fr(str(now.day))
    time_part = fr(hr_cr(int(now.hour))) + ':' + fr(str(now.minute)) + ':' + fr(str(now.second))
    return date_part + 'T' + time_part + '.' + str(now.microsecond)[:3] + 'Z'

# Async function for actually communicating with the websocket
async def transferMsTTSData(SSML_text, outputPath):
    """Send SSML_text to the Azure demo TTS websocket and collect MP3 bytes.

    NOTE(review): legacy pre-refactor version; this diff view is truncated —
    the body of the final `with` statement is not visible here.
    """
    # (Disabled) token-scraping approach kept for reference:
    # endpoint1 = "https://azure.microsoft.com/en-gb/services/cognitive-services/text-to-speech/"
    # r = requests.get(endpoint1)
    # main_web_content = r.text
    # # They hid the Auth key assignment for the websocket in the main body of the webpage....
    # token_expr = re.compile('token: \"(.*?)\"', re.DOTALL)
    # Auth_Token = re.findall(token_expr, main_web_content)[0]
    # req_id = str('%032x' % random.getrandbits(128)).upper()
    # req_id is generated by uuid.
    req_id = uuid.uuid4().hex.upper()
    print(req_id)
    # Example connection URL:
    # wss://eastus.api.speech.microsoft.com/cognitiveservices/websocket/v1?TrafficType=AzureDemo&Authorization=bearer%20undefined&X-ConnectionId=577D1E595EEB45979BA26C056A519073
    # endpoint2 = "wss://eastus.tts.speech.microsoft.com/cognitiveservices/websocket/v1?Authorization=" + \
    # Auth_Token + "&X-ConnectionId=" + req_id
    # This endpoint currently has no authentication and may stop working soon.
    endpoint2 = f"wss://eastus.api.speech.microsoft.com/cognitiveservices/websocket/v1?TrafficType=AzureDemo&Authorization=bearer%20undefined&X-ConnectionId={req_id}"
    async with websockets.connect(endpoint2,extra_headers={'Origin':'https://azure.microsoft.com'}) as websocket:
        # First message: open the speech session.
        payload_1 = '{"context":{"system":{"name":"SpeechSDK","version":"1.12.1-rc.1","build":"JavaScript","lang":"JavaScript","os":{"platform":"Browser/Linux x86_64","name":"Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0","version":"5.0 (X11)"}}}}'
        message_1 = 'Path : speech.config\r\nX-RequestId: ' + req_id + '\r\nX-Timestamp: ' + \
            getXTime() + '\r\nContent-Type: application/json\r\n\r\n' + payload_1
        await websocket.send(message_1)

        # Second message: audio output configuration (16 kHz 32 kbit mono MP3).
        payload_2 = '{"synthesis":{"audio":{"metadataOptions":{"sentenceBoundaryEnabled":false,"wordBoundaryEnabled":false},"outputFormat":"audio-16khz-32kbitrate-mono-mp3"}}}'
        message_2 = 'Path : synthesis.context\r\nX-RequestId: ' + req_id + '\r\nX-Timestamp: ' + \
            getXTime() + '\r\nContent-Type: application/json\r\n\r\n' + payload_2
        await websocket.send(message_2)

        # Third message: the SSML document itself.
        # payload_3 = '<speak xmlns="http://www.w3.org/2001/10/synthesis" ...><voice name="...">...</voice></speak>'  (inline template, disabled)
        payload_3 = SSML_text
        message_3 = 'Path: ssml\r\nX-RequestId: ' + req_id + '\r\nX-Timestamp: ' + \
            getXTime() + '\r\nContent-Type: application/ssml+xml\r\n\r\n' + payload_3
        await websocket.send(message_3)

        # Checks for close connection message
        end_resp_pat = re.compile('Path:turn.end')
        audio_stream = b''
        while(True):
            response = await websocket.recv()
            print('receiving...')
            # Make sure the message isn't telling us to stop
            if (re.search(end_resp_pat, str(response)) == None):
                # Check if our response is text data or the audio bytes
                if type(response) == type(bytes()):
                    # Extract binary data
                    try:
                        needle = b'Path:audio\r\n'
                        start_ind = response.find(needle) + len(needle)
                        audio_stream += response[start_ind:]
                    except:
                        pass
            else:
                break
    with open(f'{outputPath}.mp3', 'wb') as audio_out:
        # NOTE(review): the write call inside this `with` is cut off in this diff view.
def __init__(self,SSML_text=''):
    """Hold the SSML payload and accumulate synthesized audio bytes."""
    # Raw MP3 bytes received from the websocket, appended chunk by chunk.
    self.audio_stream = b''
    # SSML document to synthesize.
    self.SSML_text = SSML_text
    # Success flag set by transferMsTTSData; read back via getAudioStream().
    self.__result = False
# Generate X-Timestamp all correctly formatted
def getXTime(self):
# Fix the time to match Americanisms
def hr_cr(hr):
corrected = (hr - 1) % 24
return str(corrected)

# Add zeros in the right places i.e 22:1:5 -> 22:01:05
def fr(input_string):
corr = ''
i = 2 - len(input_string)
while (i > 0):
corr += '0'
i -= 1
return corr + input_string

now = datetime.now()
return fr(str(now.year)) + '-' + fr(str(now.month)) + '-' + fr(str(now.day)) + 'T' + fr(hr_cr(int(now.hour))) + ':' + fr(str(now.minute)) + ':' + fr(str(now.second)) + '.' + str(now.microsecond)[:3] + 'Z'

# Async function for actually communicating with the websocket
async def transferMsTTSData(self):
    """Stream self.SSML_text to Azure's demo TTS websocket endpoint.

    Appends every received MP3 chunk to self.audio_stream and records the
    outcome in the private success flag (read back via getAudioStream()):
    True when the service sends Path:turn.end, False when a receive
    times out.
    """
    # (Disabled) original approach scraped an auth token out of the Azure
    # marketing page and generated req_id from random bits:
    # endpoint1 = "https://azure.microsoft.com/en-gb/services/cognitive-services/text-to-speech/"
    # r = requests.get(endpoint1)
    # main_web_content = r.text
    # # They hid the Auth key assignment for the websocket in the main body of the webpage....
    # token_expr = re.compile('token: \"(.*?)\"', re.DOTALL)
    # Auth_Token = re.findall(token_expr, main_web_content)[0]
    # req_id = str('%032x' % random.getrandbits(128)).upper()
    # req_id is generated by uuid.
    req_id = uuid.uuid4().hex.upper()
    print(f" req_id:{req_id}")
    # Example: wss://eastus.api.speech.microsoft.com/cognitiveservices/websocket/v1?TrafficType=AzureDemo&Authorization=bearer%20undefined&X-ConnectionId=577D1E595EEB45979BA26C056A519073
    # endpoint2 = "wss://eastus.tts.speech.microsoft.com/cognitiveservices/websocket/v1?Authorization=" + \
    #     Auth_Token + "&X-ConnectionId=" + req_id
    # This endpoint currently requires no authentication and may stop working at any time.
    endpoint2 = f"wss://eastus.api.speech.microsoft.com/cognitiveservices/websocket/v1?TrafficType=AzureDemo&Authorization=bearer%20undefined&X-ConnectionId={req_id}"
    async with websockets.connect(endpoint2, extra_headers={'Origin': 'https://azure.microsoft.com'}) as websocket:
        # 1) Open the speech session (speech.config).
        payload_1 = '{"context":{"system":{"name":"SpeechSDK","version":"1.12.1-rc.1","build":"JavaScript","lang":"JavaScript","os":{"platform":"Browser/Linux x86_64","name":"Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0","version":"5.0 (X11)"}}}}'
        message_1 = ('Path : speech.config\r\nX-RequestId: ' + req_id + '\r\nX-Timestamp: ' +
                     self.getXTime() + '\r\nContent-Type: application/json\r\n\r\n' + payload_1)
        await websocket.send(message_1)

        # 2) Synthesis config; changing the output format requires re-sending this.
        payload_2 = '{"synthesis":{"audio":{"metadataOptions":{"sentenceBoundaryEnabled":false,"wordBoundaryEnabled":false},"outputFormat":"audio-16khz-32kbitrate-mono-mp3"}}}'
        message_2 = ('Path : synthesis.context\r\nX-RequestId: ' + req_id + '\r\nX-Timestamp: ' +
                     self.getXTime() + '\r\nContent-Type: application/json\r\n\r\n' + payload_2)
        await websocket.send(message_2)

        # 3) The SSML document to synthesize.
        payload_3 = self.SSML_text
        message_3 = ('Path: ssml\r\nX-RequestId: ' + req_id + '\r\nX-Timestamp: ' +
                     self.getXTime() + '\r\nContent-Type: application/ssml+xml\r\n\r\n' + payload_3)
        await websocket.send(message_3)

        # The service signals completion with a text frame containing Path:turn.end.
        end_resp_pat = re.compile('Path:turn.end')
        while True:
            time_seconds = 5
            try:
                response = await asyncio.wait_for(websocket.recv(), timeout=time_seconds)
            except asyncio.TimeoutError:
                # FIX: asyncio.wait_for raises asyncio.TimeoutError, which is
                # NOT the builtin TimeoutError before Python 3.11, so the
                # original `except TimeoutError` never matched there.
                self.__result = False  # response timed out
                break
            # Make sure the message isn't telling us to stop
            if re.search(end_resp_pat, str(response)) is None:
                # Binary frames carry audio; other text frames are metadata and are ignored.
                if isinstance(response, bytes):
                    # Best-effort extraction of the payload after the Path:audio header.
                    try:
                        needle = b'Path:audio\r\n'
                        start_ind = response.find(needle) + len(needle)
                        self.audio_stream += response[start_ind:]
                    except Exception:
                        pass
            else:
                self.__result = True  # turn.end received: synthesis complete
                break

def getAudioStream(self):
    """Run the websocket transfer to completion; return True iff audio was fully received."""
    loop = asyncio.get_event_loop()
    loop.run_until_complete(self.transferMsTTSData())
    return self.__result



class TTSHandler:
    """CLI driver: parses args, reads the input file, and drives MsTTS to produce an MP3."""
    def __init__(self,writeIndex=0):
        """Parse CLI args, read the input file, and synthesize it to MP3.

        writeIndex: resume index into the split text array — a non-zero
        value makes the first successful chunk append ("ab") to the
        output file instead of overwriting it ("wb").
        """
        args = self.parseArgs()
        # File extension selects the processing path below.
        self.fileType = os.path.splitext(args.input)[-1]
        # Default output name is based on a millisecond timestamp.
        self.output_path = args.output if args.output else 'output_'+ str(int(time.time()*1000))
        self.text = self.readFile(args.input)
        if self.fileType=='.xml':
            # .xml input is treated as a complete SSML document and sent as-is.
            ttsHandle = MsTTS(self.text)
            if(ttsHandle.getAudioStream()):
                self.saveDate2MP3(ttsHandle.audio_stream,self.output_path)
        else:
            # Any other extension: split plain text into chunks, wrap each
            # in SSML, synthesize sequentially, and append to one MP3 file.
            self.textArray = self.splitext(self.text)
            self.writeIndex = writeIndex
            for t in self.textArray:
                ttsHandle = MsTTS(self.wrapinSSML(t))
                if(ttsHandle.getAudioStream()):
                    # First write truncates ("wb"); subsequent writes append ("ab").
                    self.writeMode = "ab" if self.writeIndex else "wb"
                    self.writeIndex +=1
                    self.saveDate2MP3(ttsHandle.audio_stream,self.output_path,self.writeMode)
                else:
                    # Stop on first failure; writeIndex tells the user where to resume.
                    print(f"fail to get Audio Stream ,writeIndex {self.writeIndex}")
                    break

# 命令行参数解析
def parseArgs(self):
parser = argparse.ArgumentParser(description='text2speech')
parser.add_argument('--input', dest='input', help='SSML(语音合成标记语言)的路径', type=str, required=True)
parser.add_argument('--output', dest='output', help='保存mp3文件的路径', type=str, required=False)
args = parser.parse_args()
return args

def saveDate2MP3(self,audio_stream,outputPath,writeMode = "wb"):
#append ab or write wb
with open(f'{outputPath}.mp3', writeMode) as audio_out:
audio_out.write(audio_stream)


async def mainSeq(SSML_text, outputPath):
    """Legacy entry coroutine: forwards directly to transferMsTTSData."""
    await transferMsTTSData(SSML_text, outputPath)

def get_SSML(path):
    """Read the SSML file at *path* and return its full text (UTF-8)."""
    with open(path, 'r', encoding='utf-8') as ssml_file:
        return ssml_file.read()
def readFile(self,path):
# utf-8-sig check if a BOM exists or not and manages that for you which behaves exactly as utf-8
with open(path,'r',encoding='utf-8-sig') as f:
return f.read()

def splitext(self,some_string):
x=500
# [0,x)[x,2x)...
res=[some_string[y-x:y] for y in range(x, len(some_string)+x,x)]
if len(res)>1 and len(res[-1])<x/2 :
res[-2] += res[-1]
res.pop(-1)
return res


def wrapinSSML(self,text):
SSML_text ='<speak xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="http://www.w3.org/2001/mstts" xmlns:emo="http://www.w3.org/2009/10/emotionml" version="1.0" xml:lang="en-US"><voice name="' + "zh-CN-XiaoxiaoNeural" + '"><mstts:express-as style="General"><prosody rate="'+"0"+'%" pitch="'+"0"+'%">'+ text +'</prosody></mstts:express-as></voice></speak>'
return SSML_text

if __name__ == "__main__":
    # NOTE(review): diff residue — the old entry point (parseArgs/get_SSML/
    # mainSeq, the next five lines) and the new one (TTSHandler()) are
    # interleaved in this view; only one of the two flows exists per side
    # of the diff.
    args = parseArgs()
    SSML_text = get_SSML(args.input)
    output_path = args.output if args.output else 'output_'+ str(int(time.time()*1000))
    asyncio.get_event_loop().run_until_complete(mainSeq(SSML_text, output_path))
    print('completed')
    TTSHandler() # writeIndex is the index into the text array to resume from when fetching audio fails
    print(' completed')

# python tts.py --input SSML.xml
# python tts.py --input SSML.xml --output <output file name>
# python tts.py --input SSML.xml --output <output file name>
# python tts.py --input txtFile.ext --output <output file name>