
Commit 1128e73 (parent c291c89)

feat: add wordBoundaryEnabled

4 files changed: +195 −85 lines

README.md (+42 −20)
@@ -1,5 +1,6 @@
-# MsEdgeTTS
-[![npm version](https://badge.fury.io/js/msedge-tts.svg)](https://badge.fury.io/js/msedge-tts)
+# edge-tts-node
+
+[![npm version](https://badge.fury.io/js/edge-tts-node.svg)](https://badge.fury.io/js/edge-tts-node)
 
 An simple Azure Speech Service module that uses the Microsoft Edge Read Aloud API.
 
@@ -28,55 +29,76 @@ Use a library like [xml-escape](https://www.npmjs.com/package/xml-escape).
 ### Write to stream
 
 ```js
-import {MsEdgeTTS, OUTPUT_FORMAT} from "msedge-tts";
+import { MsEdgeTTS, OUTPUT_FORMAT } from "edge-tts-node";
 
 const tts = new MsEdgeTTS();
-await tts.setMetadata("en-IE-ConnorNeural", OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS);
+await tts.setMetadata(
+  "en-IE-ConnorNeural",
+  OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS
+);
 const readable = tts.toStream("Hi, how are you?");
 
 readable.on("data", (data) => {
-    console.log("DATA RECEIVED", data);
-    // raw audio file data
+  console.log("DATA RECEIVED", data);
+  // raw audio file data
 });
 
 readable.on("close", () => {
-    console.log("STREAM CLOSED");
+  console.log("STREAM CLOSED");
 });
 ```
 
 ### Write to file
 
 ```js
-import {MsEdgeTTS, OUTPUT_FORMAT} from "msedge-tts";
+import { MsEdgeTTS, OUTPUT_FORMAT } from "edge-tts-node";
 
 (async () => {
-    const tts = new MsEdgeTTS();
-    await tts.setMetadata("en-US-AriaNeural", OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS);
-    const filePath = await tts.toFile("./example_audio.webm", "Hi, how are you?");
+  const tts = new MsEdgeTTS();
+  await tts.setMetadata(
+    "en-US-AriaNeural",
+    OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS
+  );
+  const filePath = await tts.toFile("./example_audio.webm", "Hi, how are you?");
 })();
 ```
 
 ### Change voice rate, pitch and volume
+
 ```js
-import {MsEdgeTTS, OUTPUT_FORMAT} from "msedge-tts";
+import { MsEdgeTTS, OUTPUT_FORMAT } from "edge-tts-node";
 
 (async () => {
-    const tts = new MsEdgeTTS();
-    await tts.setMetadata("en-US-AriaNeural", OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS);
-    const filePath = await tts.toFile("./example_audio.webm", "Hi, how are you?", {rate: 0.5, pitch: "+200Hz"});
+  const tts = new MsEdgeTTS();
+  await tts.setMetadata(
+    "en-US-AriaNeural",
+    OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS
+  );
+  const filePath = await tts.toFile(
+    "./example_audio.webm",
+    "Hi, how are you?",
+    { rate: 0.5, pitch: "+200Hz" }
+  );
 })();
 ```
 
 ### Use an alternative HTTP Agent
+
 Use a custom http.Agent implementation like [https-proxy-agent](https://github.com/TooTallNate/proxy-agents) or [socks-proxy-agent](https://github.com/TooTallNate/proxy-agents/tree/main/packages/socks-proxy-agent).
+
 ```js
-import {SocksProxyAgent} from 'socks-proxy-agent';
+import { SocksProxyAgent } from "socks-proxy-agent";
 
 (async () => {
-    const agent = new SocksProxyAgent("socks://your-name%40gmail.com:[email protected]")
-    const tts = new MsEdgeTTS(agent);
-    await tts.setMetadata("en-US-AriaNeural", OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS);
-    const filePath = await tts.toFile("./example_audio.webm", "Hi, how are you?");
+  const agent = new SocksProxyAgent(
+    "socks://your-name%40gmail.com:[email protected]"
+  );
+  const tts = new MsEdgeTTS(agent);
+  await tts.setMetadata(
+    "en-US-AriaNeural",
+    OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS
+  );
+  const filePath = await tts.toFile("./example_audio.webm", "Hi, how are you?");
 })();
 ```
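The README diff above does not yet document the new word-boundary behaviour this commit enables. A minimal consumer sketch, assuming the defaults introduced in src/MsEdgeTTS.ts below (the static flag on, metadata chunks pushed into the same stream as UTF-8 JSON strings carrying `offset` and `duration`); the JSON.parse-based discrimination between audio and metadata chunks is an illustration, not part of the library:

```ts
import { MsEdgeTTS, OUTPUT_FORMAT } from "edge-tts-node";

(async () => {
  const tts = new MsEdgeTTS();
  await tts.setMetadata(
    "en-US-AriaNeural",
    OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS
  );

  const readable = tts.toStream("Hi, how are you?");
  readable.on("data", (chunk: Buffer) => {
    // Metadata chunks are serialized JSON; audio chunks are raw bytes.
    // Probing with JSON.parse is a consumer-side assumption for this sketch.
    try {
      const meta = JSON.parse(chunk.toString("utf8"));
      console.log("word boundary at", meta.offset, "duration", meta.duration);
    } catch {
      console.log("audio chunk, bytes:", chunk.length);
    }
  });
})();
```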

package.json (+2 −4)
@@ -1,6 +1,6 @@
 {
-  "name": "msedge-tts",
-  "version": "1.3.4",
+  "name": "edge-tts-node",
+  "version": "1.2.2",
   "description": "An Azure Speech Service module that uses the Microsoft Edge Read Aloud API.",
   "author": "Migushthe2nd <[email protected]>",
   "license": "MIT",
@@ -9,10 +9,8 @@
   "module": "./dist/index",
   "main": "./dist/index",
   "scripts": {
-    "preinstall": "npx only-allow pnpm",
     "dev": "yarn run build && ts-node src/test/test.ts",
     "build": "tsc",
-    "prepublishOnly": "yarn run build",
     "publish": "yarn publish --access=public",
     "test": "jest",
     "test:watch": "jest --watch",

src/MsEdgeTTS.ts (+96 −61)
@@ -9,6 +9,7 @@ import {Agent} from "http";
 import {PITCH} from "./PITCH";
 import {RATE} from "./RATE";
 import {VOLUME} from "./VOLUME";
+import {getHeadersAndData, parseMetadata} from "./utils";
 
 export type Voice = {
     Name: string;
@@ -42,6 +43,7 @@ export class ProsodyOptions {
 }
 
 export class MsEdgeTTS {
+    static wordBoundaryEnabled = true;
     static OUTPUT_FORMAT = OUTPUT_FORMAT;
     private static TRUSTED_CLIENT_TOKEN = "6A5AA1D4EAFF4E9FB37E23D68491D6F4";
     private static VOICES_URL = `https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=${MsEdgeTTS.TRUSTED_CLIENT_TOKEN}`;
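The new flag is a static class property, so it applies to all instances. A one-line sketch of opting out (the flag name is from this diff; placing the assignment before any connection is opened is an assumption, since the speech.config message below interpolates the value at connect time):

```ts
import { MsEdgeTTS } from "edge-tts-node";

// Disable word-boundary metadata globally; must be set before the
// WebSocket connection is initialised (i.e. before setMetadata).
MsEdgeTTS.wordBoundaryEnabled = false;
```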
@@ -57,9 +59,15 @@ export class MsEdgeTTS {
     private _streams: { [key: string]: Readable } = {};
     private _startTime = 0;
     private readonly _agent: Agent;
+    private _arraybuffer: boolean = false;
+    private state = {
+        offsetCompensation: 0, // running offset added to metadata of later turns
+        lastDurationOffset: 0  // end (offset + duration) of the last word boundary seen
+    };
 
     private _log(...o: any[]) {
         if (this._enableLogger) {
+            o.unshift('edgetts:');
             console.log(...o)
         }
     }
@@ -85,7 +93,7 @@ export class MsEdgeTTS {
             await this._initClient();
         }
         this._ws.send(message, () => {
-            this._log("<- sent message: ", message);
+            // this._log("<- sent message: ", message);
         });
     }

@@ -94,7 +102,7 @@ export class MsEdgeTTS {
             ? new WebSocket(MsEdgeTTS.SYNTH_URL)
             : new WebSocket(MsEdgeTTS.SYNTH_URL, {agent: this._agent});
 
-        this._ws.binaryType = "arraybuffer";
+        if (this._arraybuffer) this._ws.binaryType = "arraybuffer";
         return new Promise((resolve, reject) => {
             this._ws.onopen = () => {
                 this._log("Connected in", (Date.now() - this._startTime) / 1000, "seconds")
@@ -105,7 +113,7 @@ export class MsEdgeTTS {
                     "audio": {
                         "metadataoptions": {
                             "sentenceBoundaryEnabled": "false",
-                            "wordBoundaryEnabled": "false"
+                            "wordBoundaryEnabled": "${MsEdgeTTS.wordBoundaryEnabled}"
                         },
                         "outputFormat": "${this._outputFormat}"
                     }
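For reference, with the default `MsEdgeTTS.wordBoundaryEnabled = true` and the WEBM format used in the README examples, the interpolated fragment of the speech.config message would read roughly as follows; note the boundary flags are sent as JSON strings, not booleans, and the exact outputFormat string is an assumption based on the OUTPUT_FORMAT enum naming:

```json
{
  "audio": {
    "metadataoptions": {
      "sentenceBoundaryEnabled": "false",
      "wordBoundaryEnabled": "true"
    },
    "outputFormat": "webm-24khz-16bit-mono-opus"
  }
}
```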
@@ -115,22 +123,83 @@ export class MsEdgeTTS {
             `).then(resolve);
         };
         this._ws.onmessage = (m) => {
-            const buffer = Buffer.from(m.data as ArrayBuffer);
-            const message = buffer.toString()
-            const requestId = /X-RequestId:(.*?)\r\n/gm.exec(message)[1];
-            if (message.includes("Path:turn.start")) {
-                // start of turn, ignore
-            } else if (message.includes("Path:turn.end")) {
-                // end of turn, close stream
-                this._streams[requestId].push(null);
-            } else if (message.includes("Path:response")) {
-                // context response, ignore
-            } else if (message.includes("Path:audio") && m.data instanceof ArrayBuffer) {
-                this._pushAudioData(buffer, requestId)
+            this._log("message type:", typeof m.data);
+            let mdata: any = m.data;
+
+            if (typeof mdata === "string") {
+                const encodedData = Buffer.from(mdata, "utf8");
+                const message = mdata;
+                const requestId = /X-RequestId:(.*?)\r\n/gm.exec(message)[1];
+                const [headers, data] = getHeadersAndData(encodedData, encodedData.indexOf("\r\n\r\n"));
+                const path = headers["Path"];
+                if (path === "audio.metadata") {
+                    const parsedMetadata = parseMetadata(data, this.state["offsetCompensation"]);
+                    this._pushData(parsedMetadata, requestId);
+                    // Remember the end of the last word boundary for the next SSML request.
+                    this.state["lastDurationOffset"] = parsedMetadata["offset"] + parsedMetadata["duration"];
+                } else if (path === "turn.end") {
+                    // Carry the accumulated offset into the next turn, plus a fixed
+                    // inter-turn gap (8,750,000 ticks of 100 ns, i.e. 0.875 s).
+                    this.state["offsetCompensation"] = this.state["lastDurationOffset"];
+                    this.state["offsetCompensation"] += 8750000;
+                } else if (path !== "response" && path !== "turn.start") {
+                    // Any path other than "response" or "turn.start" is unexpected.
+                    throw new Error("Unknown path received");
+                }
+            } else if (Buffer.isBuffer(mdata)) {
+                const message = mdata.toString();
+                const requestId = /X-RequestId:(.*?)\r\n/gm.exec(message)[1];
+                const headerLength = mdata.readUInt16BE(0);
+                if (headerLength > mdata.length) {
+                    throw new Error("The header length is greater than the length of the data.");
+                }
+
+                // Parse the headers and data from the binary message.
+                const [headers, data] = getHeadersAndData(mdata, headerLength);
+                if (headers["Path"] !== "audio") {
+                    throw new Error("Received binary message, but the path is not audio.");
+                }
+                const contentType = headers["Content-Type"];
+                if (contentType !== "audio/mpeg" && contentType !== undefined) {
+                    throw new Error("Received binary message, but with an unexpected Content-Type.");
+                }
+
+                // A missing Content-Type is only allowed when there is no data.
+                if (contentType === undefined) {
+                    if (data.length === 0) {
+                        return;
+                    }
+                    throw new Error("Received binary message with no Content-Type, but with data.");
+                }
+
+                // If the data is empty at this point, raise an exception.
+                if (data.length === 0) {
+                    throw new Error("Received binary message, but it is missing the audio data.");
+                }
+
+                this._pushData({ type: "audio", data: data }, requestId);
             } else {
-                this._log("UNKNOWN MESSAGE", message);
+                // Fallback: an ArrayBuffer (or an object wrapping one), handled
+                // like the previous implementation.
+                mdata = Buffer.isBuffer(mdata) ? mdata : mdata["data"];
+                const buffer = Buffer.from(mdata);
+                const message = buffer.toString();
+                const requestId = /X-RequestId:(.*?)\r\n/gm.exec(message)[1];
+                this._log(message.includes("Path:audio"), Buffer.isBuffer(mdata), mdata instanceof ArrayBuffer);
+
+                if (message.includes("Path:turn.start")) {
+                    // start of turn, ignore
+                } else if (message.includes("Path:turn.end")) {
+                    // end of turn, close stream
+                    this._streams[requestId].push(null);
+                } else if (message.includes("Path:response")) {
+                    // context response, ignore
+                } else if (message.includes("Path:audio") && Buffer.isBuffer(mdata)) {
+                    this._pushAudioData(buffer, requestId);
+                } else {
+                    // this._log("UNKNOWN MESSAGE", message);
+                }
             }
         }
+
         this._ws.onclose = () => {
             this._log("disconnected after:", (Date.now() - this._startTime) / 1000, "seconds")
             for (const requestId in this._streams) {
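The onmessage handler above leans on getHeadersAndData and parseMetadata from src/utils.ts, which is the fourth changed file but is not shown in this excerpt. A plausible sketch of getHeadersAndData, consistent with the string-message call site above (split at the "\r\n\r\n" boundary, parse "Key:Value" header lines, return the headers plus the remaining payload); this is an assumption, not the committed implementation, and parseMetadata (which, per its call site, takes the metadata payload plus the current offset compensation and returns an object with at least `offset` and `duration`) is not sketched here:

```ts
import { Buffer } from "buffer";

// Sketch only: split a WebSocket message into its header block and payload.
// `headerIndex` is the offset of the header/payload boundary, e.g.
// encodedData.indexOf("\r\n\r\n") for text messages.
export function getHeadersAndData(
    data: Buffer,
    headerIndex: number
): [{ [key: string]: string }, Buffer] {
    const headers: { [key: string]: string } = {};
    for (const line of data.subarray(0, headerIndex).toString("utf8").split("\r\n")) {
        const colon = line.indexOf(":");
        if (colon > 0) {
            headers[line.slice(0, colon).trim()] = line.slice(colon + 1).trim();
        }
    }
    // Assumes the payload starts right after the "\r\n\r\n" separator.
    return [headers, data.subarray(headerIndex + 4)];
}
```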
@@ -143,11 +212,16 @@ export class MsEdgeTTS {
         });
     }
 
+    private _pushData(data: any, requestId: string) {
+        data = typeof data == "string" ? data : JSON.stringify(data);
+        this._streams[requestId].push(data, 'utf8');
+    }
+
     private _pushAudioData(audioBuffer: Buffer, requestId: string) {
         const audioStartIndex = audioBuffer.indexOf(MsEdgeTTS.BINARY_DELIM) + MsEdgeTTS.BINARY_DELIM.length;
         const audioData = audioBuffer.subarray(audioStartIndex);
         this._streams[requestId].push(audioData);
-        this._log("received audio chunk, size: ", audioData?.length)
+        this._log("_pushAudioData: received audio chunk, size: ", audioData?.length)
     }
 
     private _SSMLTemplate(input: string, options: ProsodyOptions = {}): string {
@@ -162,10 +236,6 @@ export class MsEdgeTTS {
         </speak>`;
     }
 
-    /**
-     * Fetch the list of voices available in Microsoft Edge.
-     * These, however, are not all. The complete list of voices supported by this module [can be found here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support) (neural, standard, and preview).
-     */
     getVoices(): Promise<Voice[]> {
         return new Promise((resolve, reject) => {
             axios.get(MsEdgeTTS.VOICES_URL)
@@ -174,15 +244,10 @@ export class MsEdgeTTS {
         });
     }
 
-    /**
-     * Sets the required information for the speech to be synthesised and inits a new WebSocket connection.
-     * Must be called at least once before text can be synthesised.
-     * Saved in this instance. Can be called at any time times to update the metadata.
-     *
-     * @param voiceName a string with any `ShortName`. A list of all available neural voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#neural-voices). However, it is not limited to neural voices: standard voices can also be used. A list of standard voices can be found [here](https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/language-support#standard-voices)
-     * @param outputFormat any {@link OUTPUT_FORMAT}
-     * @param voiceLocale (optional) any voice locale that is supported by the voice. See the list of all voices for compatibility. If not provided, the locale will be inferred from the `voiceName`
-     */
+    setConfig(conf: any) {
+        this._arraybuffer = conf["arraybuffer"] ?? false;
+    }
+
     async setMetadata(voiceName: string, outputFormat: OUTPUT_FORMAT, voiceLocale?: string) {
         const oldVoice = this._voice;
         const oldVoiceLocale = this._voiceLocale;
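A short usage note on the new setConfig: the only key this diff reads from the config object is "arraybuffer", which restores the previous `binaryType = "arraybuffer"` behaviour. A sketch (calling it before setMetadata so the flag is in place when the WebSocket is initialised is an assumption based on the call order above):

```ts
import { MsEdgeTTS, OUTPUT_FORMAT } from "edge-tts-node";

const tts = new MsEdgeTTS();
// Opt back into ArrayBuffer binary frames (now off by default).
tts.setConfig({ arraybuffer: true });
await tts.setMetadata("en-US-AriaNeural", OUTPUT_FORMAT.WEBM_24KHZ_16BIT_MONO_OPUS);
```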
@@ -213,54 +278,23 @@ export class MsEdgeTTS {
             "Speech synthesis not configured yet. Run setMetadata before calling toStream or toFile.");
     }
 
-    /**
-     * Close the WebSocket connection.
-     */
     close() {
         this._ws.close();
     }
 
-    /**
-     * Writes raw audio synthesised from text to a file. Uses a basic {@link _SSMLTemplate SML template}.
-     *
-     * @param path a valid output path, including a filename and file extension.
-     * @param input the input to synthesise
-     * @param options (optional) {@link ProsodyOptions}
-     * @returns {Promise<string>} - a `Promise` with the full filepath
-     */
     toFile(path: string, input: string, options?: ProsodyOptions): Promise<string> {
         return this._rawSSMLRequestToFile(path, this._SSMLTemplate(input, options));
     }
 
-    /**
-     * Writes raw audio synthesised from text in real-time to a {@link Readable}. Uses a basic {@link _SSMLTemplate SML template}.
-     *
-     * @param input the text to synthesise. Can include SSML elements.
-     * @param options (optional) {@link ProsodyOptions}
-     * @returns {Readable} - a `stream.Readable` with the audio data
-     */
     toStream(input: string, options?: ProsodyOptions): Readable {
         const {stream} = this._rawSSMLRequest(this._SSMLTemplate(input, options));
         return stream;
     }
 
-    /**
-     * Writes raw audio synthesised from text to a file. Has no SSML template. Basic SSML should be provided in the request.
-     *
-     * @param path a valid output path, including a filename and file extension.
-     * @param requestSSML the SSML to send. SSML elements required in order to work.
-     * @returns {Promise<string>} - a `Promise` with the full filepath
-     */
     rawToFile(path: string, requestSSML: string): Promise<string> {
         return this._rawSSMLRequestToFile(path, requestSSML);
     }
 
-    /**
-     * Writes raw audio synthesised from a request in real-time to a {@link Readable}. Has no SSML template. Basic SSML should be provided in the request.
-     *
-     * @param requestSSML the SSML to send. SSML elements required in order to work.
-     * @returns {Readable} - a `stream.Readable` with the audio data
-     */
     rawToStream(requestSSML: string): Readable {
         const {stream} = this._rawSSMLRequest(requestSSML);
         return stream;
@@ -300,6 +334,7 @@ export class MsEdgeTTS {
             read() {
             },
             destroy(error: Error | null, callback: (error: (Error | null)) => void) {
+                self._log("stream destroyed:", error);
                 delete self._streams[requestId];
                 callback(error);
             },
