@@ -9,6 +9,7 @@ import {Agent} from "http";
9
9
import { PITCH } from "./PITCH" ;
10
10
import { RATE } from "./RATE" ;
11
11
import { VOLUME } from "./VOLUME" ;
12
+ import { getHeadersAndData , parseMetadata } from "./utils" ;
12
13
13
14
export type Voice = {
14
15
Name : string ;
@@ -42,6 +43,7 @@ export class ProsodyOptions {
42
43
}
43
44
44
45
export class MsEdgeTTS {
46
+ static wordBoundaryEnabled = true ;
45
47
static OUTPUT_FORMAT = OUTPUT_FORMAT ;
46
48
private static TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4" ;
47
49
private static VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${ MsEdgeTTS . TRUSTED_CLIENT_TOKEN } ` ;
@@ -57,9 +59,15 @@ export class MsEdgeTTS {
57
59
private _streams : { [ key : string ] : Readable } = { } ;
58
60
private _startTime = 0 ;
59
61
private readonly _agent : Agent ;
62
+ private _arraybuffer : boolean = false ;
63
+ private state = {
64
+ offsetCompensation : 0 ,
65
+ lastDurationOffset : 0
66
+ } ;
60
67
61
68
/**
 * Writes the given values to the console, prefixed with 'edgetts:'.
 * Does nothing unless `_enableLogger` is truthy.
 *
 * @param o values to print
 */
private _log(...o: any[]) {
    if (!this._enableLogger) {
        return;
    }
    // Prefix is passed as the first console argument instead of mutating `o`.
    console.log('edgetts:', ...o);
}
@@ -85,7 +93,7 @@ export class MsEdgeTTS {
85
93
await this . _initClient ( ) ;
86
94
}
87
95
this . _ws . send ( message , ( ) => {
88
- this . _log ( "<- sent message: " , message ) ;
96
+ // this._log("<- sent message: ", message);
89
97
} ) ;
90
98
}
91
99
@@ -94,7 +102,7 @@ export class MsEdgeTTS {
94
102
? new WebSocket ( MsEdgeTTS . SYNTH_URL )
95
103
: new WebSocket ( MsEdgeTTS . SYNTH_URL , { agent : this . _agent } ) ;
96
104
97
- this . _ws . binaryType = "arraybuffer" ;
105
+ if ( this . _arraybuffer ) this . _ws . binaryType = "arraybuffer" ;
98
106
return new Promise ( ( resolve , reject ) => {
99
107
this . _ws . onopen = ( ) => {
100
108
this . _log ( "Connected in" , ( Date . now ( ) - this . _startTime ) / 1000 , "seconds" )
@@ -105,7 +113,7 @@ export class MsEdgeTTS {
105
113
"audio": {
106
114
"metadataoptions": {
107
115
"sentenceBoundaryEnabled": "false",
108
- "wordBoundaryEnabled": "false "
116
+ "wordBoundaryEnabled": "${ MsEdgeTTS . wordBoundaryEnabled } "
109
117
},
110
118
"outputFormat": "${ this . _outputFormat } "
111
119
}
@@ -115,22 +123,83 @@ export class MsEdgeTTS {
115
123
` ) . then ( resolve ) ;
116
124
} ;
117
125
// Dispatches every incoming WebSocket frame.
//   - string frames: header/body text messages (audio.metadata, turn.start,
//     turn.end, response);
//   - Buffer frames: binary audio, forwarded to the stream JSON-wrapped as
//     {type: "audio", data};
//   - any other binary shape (ArrayBuffer, typed-array view, {data: ...}):
//     binary audio, forwarded raw through _pushAudioData.
// Frames without an X-RequestId header are logged and dropped instead of
// crashing the handler on a failed regex match.
this._ws.onmessage = (m) => {
    this._log("type:::::::: ", typeof m.data);
    const payload: unknown = m.data;

    // Returns the X-RequestId header value, or undefined when it is absent.
    const requestIdOf = (text: string): string | undefined =>
        /X-RequestId:(.*?)\r\n/gm.exec(text)?.[1];

    if (typeof payload === 'string') {
        const requestId = requestIdOf(payload);
        if (requestId === undefined) {
            this._log("text message without X-RequestId, ignoring");
            return;
        }
        const encodedData = Buffer.from(payload, 'utf8');
        const [headers, data] = getHeadersAndData(encodedData, encodedData.indexOf("\r\n\r\n"));
        const path = headers['Path'];
        if (path === "audio.metadata") {
            const parsedMetadata = parseMetadata(data, this.state.offsetCompensation);
            this._pushData(parsedMetadata, requestId);
            // Remember where this item ended; used as the base offset for the next SSML request.
            this.state.lastDurationOffset = parsedMetadata["offset"] + parsedMetadata["duration"];
        } else if (path === "turn.end") {
            // NOTE(review): 8750000 is presumably padding expressed in the same
            // unit as offset/duration - confirm against the service behaviour.
            this.state.offsetCompensation = this.state.lastDurationOffset + 8750000;
            // End of turn: close the stream. Text frames always reach this
            // branch, so without this the consumer would never see EOF.
            this._streams[requestId]?.push(null);
        } else if (path !== "response" && path !== "turn.start") {
            // Anything other than "response" or "turn.start" is unexpected.
            throw new Error("Unknown path received");
        }
        return;
    }

    if (Buffer.isBuffer(payload)) {
        const requestId = requestIdOf(payload.toString());
        if (requestId === undefined) {
            this._log("binary message without X-RequestId, ignoring");
            return;
        }
        // The first two bytes carry the big-endian header length.
        const headerLength = payload.readUInt16BE(0);
        if (headerLength > payload.length) {
            throw new Error("The header length is greater than the length of the data.");
        }

        // Parse the headers and data from the binary message.
        const [headers, data] = getHeadersAndData(payload, headerLength);
        if (headers['Path'] !== 'audio') {
            throw new Error("Received binary message, but the path is not audio.");
        }
        const contentType = headers['Content-Type'];
        if (contentType !== 'audio/mpeg' && contentType !== undefined) {
            throw new Error("Received binary message, but with an unexpected Content-Type.");
        }

        // We only allow no Content-Type if there is no data.
        if (contentType === undefined) {
            if (data.length === 0) {
                return;
            }
            throw new Error("Received binary message with no Content-Type, but with data.");
        }

        // A typed audio message must actually carry audio.
        if (data.length === 0) {
            throw new Error("Received binary message, but it is missing the audio data.");
        }

        this._pushData({type: "audio", data: data}, requestId);
        return;
    }

    // Remaining binary shapes. Previously an ArrayBuffer fell through to
    // `mdata['data']` (undefined), which made Buffer.from throw.
    let buffer: Buffer;
    if (payload instanceof ArrayBuffer) {
        buffer = Buffer.from(payload);
    } else if (ArrayBuffer.isView(payload)) {
        buffer = Buffer.from(payload.buffer, payload.byteOffset, payload.byteLength);
    } else {
        const inner = (payload as { data?: unknown } | null | undefined)?.data;
        if (inner == null) {
            this._log("unsupported message payload, ignoring");
            return;
        }
        buffer = Buffer.from(inner as ArrayLike<number>);
    }

    const message = buffer.toString();
    const requestId = requestIdOf(message);
    if (requestId === undefined) {
        this._log("binary message without X-RequestId, ignoring");
        return;
    }
    this._log(message.includes("Path:audio"), Buffer.isBuffer(payload), payload instanceof ArrayBuffer);

    if (message.includes("Path:turn.start")) {
        // start of turn, ignore
    } else if (message.includes("Path:turn.end")) {
        // end of turn, close stream
        this._streams[requestId]?.push(null);
    } else if (message.includes("Path:response")) {
        // context response, ignore
    } else if (message.includes("Path:audio")) {
        this._pushAudioData(buffer, requestId);
    }
    // Any other message is silently ignored.
}
202
+
134
203
this . _ws . onclose = ( ) => {
135
204
this . _log ( "disconnected after:" , ( Date . now ( ) - this . _startTime ) / 1000 , "seconds" )
136
205
for ( const requestId in this . _streams ) {
@@ -143,11 +212,16 @@ export class MsEdgeTTS {
143
212
} ) ;
144
213
}
145
214
215
/**
 * Queues a payload on the stream registered for `requestId`.
 * Strings are pushed unchanged; any other value is JSON-encoded first.
 * If no stream is registered under `requestId` (its `destroy` hook removes
 * the entry), the payload is logged and dropped instead of throwing.
 *
 * @param data string or JSON-serialisable value to emit
 * @param requestId key of the target stream in `_streams`
 */
private _pushData(data: any, requestId: string) {
    const stream = this._streams[requestId];
    if (!stream) {
        this._log("_pushData: no stream for request", requestId);
        return;
    }
    const payload = typeof data === "string" ? data : JSON.stringify(data);
    stream.push(payload, 'utf8');
}
219
+
146
220
/**
 * Strips the message header (everything up to and including
 * `MsEdgeTTS.BINARY_DELIM`) and queues the remaining bytes on the stream
 * registered for `requestId`.
 *
 * Chunks without the delimiter are dropped: previously `indexOf` returning -1
 * produced a bogus start index and header bytes were pushed as audio.
 * Chunks for a request whose stream is no longer registered are dropped too.
 *
 * @param audioBuffer full binary message, header included
 * @param requestId key of the target stream in `_streams`
 */
private _pushAudioData(audioBuffer: Buffer, requestId: string) {
    const delimiterIndex = audioBuffer.indexOf(MsEdgeTTS.BINARY_DELIM);
    if (delimiterIndex === -1) {
        this._log("_pushAudioData: delimiter not found, dropping chunk");
        return;
    }
    const stream = this._streams[requestId];
    if (!stream) {
        this._log("_pushAudioData: no stream for request", requestId);
        return;
    }
    const audioData = audioBuffer.subarray(delimiterIndex + MsEdgeTTS.BINARY_DELIM.length);
    stream.push(audioData);
    this._log("_pushAudioData: received audio chunk, size: ", audioData.length)
}
152
226
153
227
private _SSMLTemplate ( input : string , options : ProsodyOptions = { } ) : string {
@@ -162,10 +236,6 @@ export class MsEdgeTTS {
162
236
</speak>` ;
163
237
}
164
238
165
- /**
166
- * Fetch the list of voices available in Microsoft Edge.
167
- * These, however, are not all. The complete list of voices supported by this module [can be found here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support) (neural, standard, and preview).
168
- */
169
239
getVoices ( ) : Promise < Voice [ ] > {
170
240
return new Promise ( ( resolve , reject ) => {
171
241
axios . get ( MsEdgeTTS . VOICES_URL )
@@ -174,15 +244,10 @@ export class MsEdgeTTS {
174
244
} ) ;
175
245
}
176
246
177
- /**
178
- * Sets the required information for the speech to be synthesised and inits a new WebSocket connection.
179
- * Must be called at least once before text can be synthesised.
180
- * Saved in this instance. Can be called at any time times to update the metadata.
181
- *
182
- * @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices)
183
- * @param outputFormat any {@link OUTPUT_FORMAT}
184
- * @param voiceLocale (optional) any voice locale that is supported by the voice. See the list of all voices for compatibility. If not provided, the locale will be inferred from the `voiceName`
185
- */
247
/**
 * Applies client configuration.
 *
 * Currently only `arraybuffer` is read: when truthy, `_initClient` sets the
 * socket's `binaryType` to "arraybuffer" the next time it opens a connection.
 * A missing key, or a `null`/`undefined` `conf`, resets the flag to `false`.
 *
 * @param conf configuration object, e.g. `{arraybuffer: true}`
 */
setConfig(conf: any) {
    // Optional chaining keeps a nullish `conf` from throwing.
    this._arraybuffer = conf?.["arraybuffer"] ?? false;
}
250
+
186
251
async setMetadata ( voiceName : string , outputFormat : OUTPUT_FORMAT , voiceLocale ?: string ) {
187
252
const oldVoice = this . _voice ;
188
253
const oldVoiceLocale = this . _voiceLocale ;
@@ -213,54 +278,23 @@ export class MsEdgeTTS {
213
278
"Speech synthesis not configured yet. Run setMetadata before calling toStream or toFile." ) ;
214
279
}
215
280
216
- /**
217
- * Close the WebSocket connection.
218
- */
219
281
/**
 * Closes the WebSocket connection held by this instance.
 */
close(): void {
    const socket = this._ws;
    socket.close();
}
222
284
223
- /**
224
- * Writes raw audio synthesised from text to a file. Uses a basic {@link _SSMLTemplate SML template}.
225
- *
226
- * @param path a valid output path, including a filename and file extension.
227
- * @param input the input to synthesise
228
- * @param options (optional) {@link ProsodyOptions}
229
- * @returns {Promise<string> } - a `Promise` with the full filepath
230
- */
231
285
/**
 * Wraps `input` in the SSML produced by `_SSMLTemplate` and hands it, together
 * with `path`, to `_rawSSMLRequestToFile`.
 *
 * @param path output path, including filename and extension
 * @param input the text to synthesise
 * @param options (optional) prosody options forwarded to the SSML template
 * @returns the promise returned by `_rawSSMLRequestToFile` (previously documented as resolving with the full file path)
 */
toFile(path: string, input: string, options?: ProsodyOptions): Promise<string> {
    const ssml = this._SSMLTemplate(input, options);
    return this._rawSSMLRequestToFile(path, ssml);
}
234
288
235
- /**
236
- * Writes raw audio synthesised from text in real-time to a {@link Readable}. Uses a basic {@link _SSMLTemplate SML template}.
237
- *
238
- * @param input the text to synthesise. Can include SSML elements.
239
- * @param options (optional) {@link ProsodyOptions}
240
- * @returns {Readable } - a `stream.Readable` with the audio data
241
- */
242
289
/**
 * Wraps `input` in the SSML produced by `_SSMLTemplate`, issues the request
 * through `_rawSSMLRequest`, and returns the resulting stream.
 *
 * @param input the text to synthesise
 * @param options (optional) prosody options forwarded to the SSML template
 * @returns the `stream` member of the `_rawSSMLRequest` result
 */
toStream(input: string, options?: ProsodyOptions): Readable {
    const ssml = this._SSMLTemplate(input, options);
    return this._rawSSMLRequest(ssml).stream;
}
246
293
247
- /**
248
- * Writes raw audio synthesised from text to a file. Has no SSML template. Basic SSML should be provided in the request.
249
- *
250
- * @param path a valid output path, including a filename and file extension.
251
- * @param requestSSML the SSML to send. SSML elements required in order to work.
252
- * @returns {Promise<string> } - a `Promise` with the full filepath
253
- */
254
294
/**
 * Sends caller-supplied SSML, without applying the SSML template, through
 * `_rawSSMLRequestToFile`.
 *
 * @param path output path, including filename and extension
 * @param requestSSML the complete SSML document to send
 * @returns the promise returned by `_rawSSMLRequestToFile` (previously documented as resolving with the full file path)
 */
rawToFile(path: string, requestSSML: string): Promise<string> {
    const pendingWrite = this._rawSSMLRequestToFile(path, requestSSML);
    return pendingWrite;
}
257
297
258
- /**
259
- * Writes raw audio synthesised from a request in real-time to a {@link Readable}. Has no SSML template. Basic SSML should be provided in the request.
260
- *
261
- * @param requestSSML the SSML to send. SSML elements required in order to work.
262
- * @returns {Readable } - a `stream.Readable` with the audio data
263
- */
264
298
rawToStream ( requestSSML : string ) : Readable {
265
299
const { stream} = this . _rawSSMLRequest ( requestSSML ) ;
266
300
return stream ;
@@ -300,6 +334,7 @@ export class MsEdgeTTS {
300
334
read() {
    // Intentionally empty: nothing is produced on demand here.
    // NOTE(review): data appears to be pushed from the WebSocket message handler - confirm.
},
302
336
// Teardown hook: logs the error (if any), removes this request's entry from
// the stream registry, then reports completion through `callback`.
destroy(error: Error | null, callback: (error: (Error | null)) => void) {
    self._log("+_+_+_+__+_", error);
    const registry = self._streams;
    delete registry[requestId];
    callback(error);
},
0 commit comments