Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added a file-to-speech command and restructured the files (新增了文件转语音命令,修改了文件结构) #24

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
242 changes: 150 additions & 92 deletions python_cli_demo/tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,104 +8,162 @@
import re
import uuid
import argparse
import os

class MsTTS:

def parseArgs():
    """Parse command-line arguments for the text-to-speech CLI."""
    cli = argparse.ArgumentParser(description='text2speech')
    cli.add_argument('--input', dest='input', help='SSML(语音合成标记语言)的路径', type=str, required=True)
    cli.add_argument('--output', dest='output', help='保存mp3文件的路径', type=str, required=False)
    return cli.parse_args()

# Fix the time to match Americanisms
def hr_cr(hr):
    """Shift the hour back by one, wrapping on a 24-hour clock; return it as a string."""
    return str((hr + 23) % 24)

# Add zeros in the right places i.e 22:1:5 -> 22:01:05
def fr(input_string):
    """Left-pad *input_string* with '0' to at least two characters (longer strings pass through)."""
    return input_string.rjust(2, '0')

# Generate X-Timestamp all correctly formatted
def getXTime():
    """Return the current local time formatted for the X-Timestamp header (hour shifted via hr_cr)."""
    now = datetime.now()
    date_part = fr(str(now.year)) + '-' + fr(str(now.month)) + '-' + fr(str(now.day))
    time_part = fr(hr_cr(int(now.hour))) + ':' + fr(str(now.minute)) + ':' + fr(str(now.second))
    return date_part + 'T' + time_part + '.' + str(now.microsecond)[:3] + 'Z'

# Async function for actually communicating with the websocket
async def transferMsTTSData(SSML_text, outputPath):
    """Send SSML_text to the Azure demo TTS websocket and collect MP3 bytes.

    NOTE(review): legacy pre-refactor version; this diff view is truncated —
    the body of the final `with` statement is not visible here.
    """
    # (Disabled) token-scraping approach kept for reference:
    # endpoint1 = "https://azure.microsoft.com/en-gb/services/cognitive-services/text-to-speech/"
    # r = requests.get(endpoint1)
    # main_web_content = r.text
    # # They hid the Auth key assignment for the websocket in the main body of the webpage....
    # token_expr = re.compile('token: \"(.*?)\"', re.DOTALL)
    # Auth_Token = re.findall(token_expr, main_web_content)[0]
    # req_id = str('%032x' % random.getrandbits(128)).upper()
    # req_id is generated by uuid.
    req_id = uuid.uuid4().hex.upper()
    print(req_id)
    # Example connection URL:
    # wss://eastus.api.speech.microsoft.com/cognitiveservices/websocket/v1?TrafficType=AzureDemo&Authorization=bearer%20undefined&X-ConnectionId=577D1E595EEB45979BA26C056A519073
    # endpoint2 = "wss://eastus.tts.speech.microsoft.com/cognitiveservices/websocket/v1?Authorization=" + \
    # Auth_Token + "&X-ConnectionId=" + req_id
    # This endpoint currently has no authentication and may stop working soon.
    endpoint2 = f"wss://eastus.api.speech.microsoft.com/cognitiveservices/websocket/v1?TrafficType=AzureDemo&Authorization=bearer%20undefined&X-ConnectionId={req_id}"
    async with websockets.connect(endpoint2,extra_headers={'Origin':'https://azure.microsoft.com'}) as websocket:
        # First message: open the speech session.
        payload_1 = '{"context":{"system":{"name":"SpeechSDK","version":"1.12.1-rc.1","build":"JavaScript","lang":"JavaScript","os":{"platform":"Browser/Linux x86_64","name":"Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0","version":"5.0 (X11)"}}}}'
        message_1 = 'Path : speech.config\r\nX-RequestId: ' + req_id + '\r\nX-Timestamp: ' + \
            getXTime() + '\r\nContent-Type: application/json\r\n\r\n' + payload_1
        await websocket.send(message_1)

        # Second message: audio output configuration (16 kHz 32 kbit mono MP3).
        payload_2 = '{"synthesis":{"audio":{"metadataOptions":{"sentenceBoundaryEnabled":false,"wordBoundaryEnabled":false},"outputFormat":"audio-16khz-32kbitrate-mono-mp3"}}}'
        message_2 = 'Path : synthesis.context\r\nX-RequestId: ' + req_id + '\r\nX-Timestamp: ' + \
            getXTime() + '\r\nContent-Type: application/json\r\n\r\n' + payload_2
        await websocket.send(message_2)

        # Third message: the SSML document itself.
        # payload_3 = '<speak xmlns="http://www.w3.org/2001/10/synthesis" ...><voice name="...">...</voice></speak>'  (inline template, disabled)
        payload_3 = SSML_text
        message_3 = 'Path: ssml\r\nX-RequestId: ' + req_id + '\r\nX-Timestamp: ' + \
            getXTime() + '\r\nContent-Type: application/ssml+xml\r\n\r\n' + payload_3
        await websocket.send(message_3)

        # Checks for close connection message
        end_resp_pat = re.compile('Path:turn.end')
        audio_stream = b''
        while(True):
            response = await websocket.recv()
            print('receiving...')
            # Make sure the message isn't telling us to stop
            if (re.search(end_resp_pat, str(response)) == None):
                # Check if our response is text data or the audio bytes
                if type(response) == type(bytes()):
                    # Extract binary data
                    try:
                        needle = b'Path:audio\r\n'
                        start_ind = response.find(needle) + len(needle)
                        audio_stream += response[start_ind:]
                    except:
                        pass
            else:
                break
    with open(f'{outputPath}.mp3', 'wb') as audio_out:
        # NOTE(review): the write call inside this `with` is cut off in this diff view.
def __init__(self,SSML_text=''):
    """Hold the SSML payload and accumulate synthesized audio bytes."""
    # Raw MP3 bytes received from the websocket, appended chunk by chunk.
    self.audio_stream = b''
    # SSML document to synthesize.
    self.SSML_text = SSML_text
    # Success flag set by transferMsTTSData; read back via getAudioStream().
    self.__result = False
# Generate X-Timestamp all correctly formatted
def getXTime(self):
# Fix the time to match Americanisms
def hr_cr(hr):
corrected = (hr - 1) % 24
return str(corrected)

# Add zeros in the right places i.e 22:1:5 -> 22:01:05
def fr(input_string):
corr = ''
i = 2 - len(input_string)
while (i > 0):
corr += '0'
i -= 1
return corr + input_string

now = datetime.now()
return fr(str(now.year)) + '-' + fr(str(now.month)) + '-' + fr(str(now.day)) + 'T' + fr(hr_cr(int(now.hour))) + ':' + fr(str(now.minute)) + ':' + fr(str(now.second)) + '.' + str(now.microsecond)[:3] + 'Z'

# Async function for actually communicating with the websocket
async def transferMsTTSData(self):
    """Stream self.SSML_text to Azure's demo TTS websocket endpoint.

    Appends every received MP3 chunk to self.audio_stream and records the
    outcome in the private success flag (read back via getAudioStream()):
    True when the service sends Path:turn.end, False when a receive
    times out.
    """
    # (Disabled) original approach scraped an auth token out of the Azure
    # marketing page and generated req_id from random bits:
    # endpoint1 = "https://azure.microsoft.com/en-gb/services/cognitive-services/text-to-speech/"
    # r = requests.get(endpoint1)
    # main_web_content = r.text
    # # They hid the Auth key assignment for the websocket in the main body of the webpage....
    # token_expr = re.compile('token: \"(.*?)\"', re.DOTALL)
    # Auth_Token = re.findall(token_expr, main_web_content)[0]
    # req_id = str('%032x' % random.getrandbits(128)).upper()
    # req_id is generated by uuid.
    req_id = uuid.uuid4().hex.upper()
    print(f" req_id:{req_id}")
    # Example: wss://eastus.api.speech.microsoft.com/cognitiveservices/websocket/v1?TrafficType=AzureDemo&Authorization=bearer%20undefined&X-ConnectionId=577D1E595EEB45979BA26C056A519073
    # endpoint2 = "wss://eastus.tts.speech.microsoft.com/cognitiveservices/websocket/v1?Authorization=" + \
    #     Auth_Token + "&X-ConnectionId=" + req_id
    # This endpoint currently requires no authentication and may stop working at any time.
    endpoint2 = f"wss://eastus.api.speech.microsoft.com/cognitiveservices/websocket/v1?TrafficType=AzureDemo&Authorization=bearer%20undefined&X-ConnectionId={req_id}"
    async with websockets.connect(endpoint2, extra_headers={'Origin': 'https://azure.microsoft.com'}) as websocket:
        # 1) Open the speech session (speech.config).
        payload_1 = '{"context":{"system":{"name":"SpeechSDK","version":"1.12.1-rc.1","build":"JavaScript","lang":"JavaScript","os":{"platform":"Browser/Linux x86_64","name":"Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0","version":"5.0 (X11)"}}}}'
        message_1 = ('Path : speech.config\r\nX-RequestId: ' + req_id + '\r\nX-Timestamp: ' +
                     self.getXTime() + '\r\nContent-Type: application/json\r\n\r\n' + payload_1)
        await websocket.send(message_1)

        # 2) Synthesis config; changing the output format requires re-sending this.
        payload_2 = '{"synthesis":{"audio":{"metadataOptions":{"sentenceBoundaryEnabled":false,"wordBoundaryEnabled":false},"outputFormat":"audio-16khz-32kbitrate-mono-mp3"}}}'
        message_2 = ('Path : synthesis.context\r\nX-RequestId: ' + req_id + '\r\nX-Timestamp: ' +
                     self.getXTime() + '\r\nContent-Type: application/json\r\n\r\n' + payload_2)
        await websocket.send(message_2)

        # 3) The SSML document to synthesize.
        payload_3 = self.SSML_text
        message_3 = ('Path: ssml\r\nX-RequestId: ' + req_id + '\r\nX-Timestamp: ' +
                     self.getXTime() + '\r\nContent-Type: application/ssml+xml\r\n\r\n' + payload_3)
        await websocket.send(message_3)

        # The service signals completion with a text frame containing Path:turn.end.
        end_resp_pat = re.compile('Path:turn.end')
        while True:
            time_seconds = 5
            try:
                response = await asyncio.wait_for(websocket.recv(), timeout=time_seconds)
            except asyncio.TimeoutError:
                # FIX: asyncio.wait_for raises asyncio.TimeoutError, which is
                # NOT the builtin TimeoutError before Python 3.11, so the
                # original `except TimeoutError` never matched there.
                self.__result = False  # response timed out
                break
            # Make sure the message isn't telling us to stop
            if re.search(end_resp_pat, str(response)) is None:
                # Binary frames carry audio; other text frames are metadata and are ignored.
                if isinstance(response, bytes):
                    # Best-effort extraction of the payload after the Path:audio header.
                    try:
                        needle = b'Path:audio\r\n'
                        start_ind = response.find(needle) + len(needle)
                        self.audio_stream += response[start_ind:]
                    except Exception:
                        pass
            else:
                self.__result = True  # turn.end received: synthesis complete
                break

def getAudioStream(self):
    """Run the websocket transfer to completion; return True iff audio was fully received."""
    loop = asyncio.get_event_loop()
    loop.run_until_complete(self.transferMsTTSData())
    return self.__result



class TTSHandler:
    """CLI driver: parses args, reads the input file, and drives MsTTS to produce an MP3."""
    def __init__(self,writeIndex=0):
        """Parse CLI args, read the input file, and synthesize it to MP3.

        writeIndex: resume index into the split text array — a non-zero
        value makes the first successful chunk append ("ab") to the
        output file instead of overwriting it ("wb").
        """
        args = self.parseArgs()
        # File extension selects the processing path below.
        self.fileType = os.path.splitext(args.input)[-1]
        # Default output name is based on a millisecond timestamp.
        self.output_path = args.output if args.output else 'output_'+ str(int(time.time()*1000))
        self.text = self.readFile(args.input)
        if self.fileType=='.xml':
            # .xml input is treated as a complete SSML document and sent as-is.
            ttsHandle = MsTTS(self.text)
            if(ttsHandle.getAudioStream()):
                self.saveDate2MP3(ttsHandle.audio_stream,self.output_path)
        else:
            # Any other extension: split plain text into chunks, wrap each
            # in SSML, synthesize sequentially, and append to one MP3 file.
            self.textArray = self.splitext(self.text)
            self.writeIndex = writeIndex
            for t in self.textArray:
                ttsHandle = MsTTS(self.wrapinSSML(t))
                if(ttsHandle.getAudioStream()):
                    # First write truncates ("wb"); subsequent writes append ("ab").
                    self.writeMode = "ab" if self.writeIndex else "wb"
                    self.writeIndex +=1
                    self.saveDate2MP3(ttsHandle.audio_stream,self.output_path,self.writeMode)
                else:
                    # Stop on first failure; writeIndex tells the user where to resume.
                    print(f"fail to get Audio Stream ,writeIndex {self.writeIndex}")
                    break

# 命令行参数解析
def parseArgs(self):
parser = argparse.ArgumentParser(description='text2speech')
parser.add_argument('--input', dest='input', help='SSML(语音合成标记语言)的路径', type=str, required=True)
parser.add_argument('--output', dest='output', help='保存mp3文件的路径', type=str, required=False)
args = parser.parse_args()
return args

def saveDate2MP3(self,audio_stream,outputPath,writeMode = "wb"):
#append ab or write wb
with open(f'{outputPath}.mp3', writeMode) as audio_out:
audio_out.write(audio_stream)


async def mainSeq(SSML_text, outputPath):
    """Legacy entry coroutine: forwards directly to transferMsTTSData."""
    await transferMsTTSData(SSML_text, outputPath)

def get_SSML(path):
    """Read the SSML file at *path* and return its full text (UTF-8)."""
    with open(path, 'r', encoding='utf-8') as ssml_file:
        return ssml_file.read()
def readFile(self,path):
# utf-8-sig check if a BOM exists or not and manages that for you which behaves exactly as utf-8
with open(path,'r',encoding='utf-8-sig') as f:
return f.read()

def splitext(self,some_string):
x=500
# [0,x)[x,2x)...
res=[some_string[y-x:y] for y in range(x, len(some_string)+x,x)]
if len(res)>1 and len(res[-1])<x/2 :
res[-2] += res[-1]
res.pop(-1)
return res


def wrapinSSML(self,text):
SSML_text ='<speak xmlns="http://www.w3.org/2001/10/synthesis" xmlns:mstts="http://www.w3.org/2001/mstts" xmlns:emo="http://www.w3.org/2009/10/emotionml" version="1.0" xml:lang="en-US"><voice name="' + "zh-CN-XiaoxiaoNeural" + '"><mstts:express-as style="General"><prosody rate="'+"0"+'%" pitch="'+"0"+'%">'+ text +'</prosody></mstts:express-as></voice></speak>'
return SSML_text

if __name__ == "__main__":
    # NOTE(review): diff residue — the old entry point (parseArgs/get_SSML/
    # mainSeq, the next five lines) and the new one (TTSHandler()) are
    # interleaved in this view; only one of the two flows exists per side
    # of the diff.
    args = parseArgs()
    SSML_text = get_SSML(args.input)
    output_path = args.output if args.output else 'output_'+ str(int(time.time()*1000))
    asyncio.get_event_loop().run_until_complete(mainSeq(SSML_text, output_path))
    print('completed')
    TTSHandler() # writeIndex is the index into the text array to resume from when fetching audio fails
    print(' completed')

# python tts.py --input SSML.xml
# python tts.py --input SSML.xml --output <output file name>
# python tts.py --input SSML.xml --output <output file name>
# python tts.py --input txtFile.ext --output <output file name>