diff --git a/.github/scripts/node-addon/package-optional.json b/.github/scripts/node-addon/package-optional.json index b3c71f9da..d2db2e192 100644 --- a/.github/scripts/node-addon/package-optional.json +++ b/.github/scripts/node-addon/package-optional.json @@ -1,7 +1,7 @@ { "name": "sherpa-onnx-PLATFORM2-ARCH", "version": "SHERPA_ONNX_VERSION", - "description": "Speech-to-text and text-to-speech using Next-gen Kaldi without internet connection", + "description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection", "main": "index.js", "scripts": { "test": "echo \"Error: no test specified\" && exit 1" @@ -16,8 +16,18 @@ "transcription", "real-time speech recognition", "without internet connection", + "locally", + "local", "embedded systems", "open source", + "diarization", + "speaker diarization", + "speaker recognition", + "speaker", + "speaker segmentation", + "speaker verification", + "spoken language identification", + "sherpa", "zipformer", "asr", "tts", @@ -30,13 +40,13 @@ "offline", "privacy", "open source", - "vad", - "speaker id", - "language id", - "node-addon-api", "streaming speech recognition", "speech", - "recognition" + "recognition", + "vad", + "node-addon-api", + "speaker id", + "language id" ], "author": "The next-gen Kaldi team", "license": "Apache-2.0", diff --git a/.github/scripts/node-addon/package.json b/.github/scripts/node-addon/package.json index 0444552fc..bc2d89e89 100644 --- a/.github/scripts/node-addon/package.json +++ b/.github/scripts/node-addon/package.json @@ -1,7 +1,7 @@ { "name": "sherpa-onnx-node", "version": "SHERPA_ONNX_VERSION", - "description": "Speech-to-text and text-to-speech using Next-gen Kaldi without internet connection", + "description": "Speech-to-text, text-to-speech, and speaker diarization using Next-gen Kaldi without internet connection", "main": "sherpa-onnx.js", "scripts": { "test": "echo \"Error: no test specified\" && exit 1" @@ -16,8 +16,18 @@ "transcription", "real-time speech recognition", "without internet connection", + "locally", + "local", "embedded systems", "open source", + "diarization", + "speaker diarization", + "speaker recognition", + "speaker", + "speaker segmentation", + "speaker verification", + "spoken language identification", + "sherpa", "zipformer", "asr", "tts", @@ -30,13 +40,13 @@ "offline", "privacy", "open source", - "vad", - "speaker id", - "language id", - "node-addon-api", "streaming speech recognition", "speech", - "recognition" + "recognition", + "vad", + "node-addon-api", + "speaker id", + "language id" ], "author": "The next-gen Kaldi team", "license": "Apache-2.0", diff --git a/.github/scripts/test-nodejs-addon-npm.sh b/.github/scripts/test-nodejs-addon-npm.sh index a46e2de8e..42c753ebf 100755 --- a/.github/scripts/test-nodejs-addon-npm.sh +++ b/.github/scripts/test-nodejs-addon-npm.sh @@ -10,6 +10,20 @@ arch=$(node -p "require('os').arch()") platform=$(node -p "require('os').platform()") node_version=$(node -p "process.versions.node.split('.')[0]") +echo "----------non-streaming speaker diarization----------" + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 +tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 +rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx + +curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav + +node ./test_offline_speaker_diarization.js + +rm -rfv *.onnx *.wav sherpa-onnx-pyannote-* + echo "----------non-streaming asr + vad----------" curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-whisper-tiny.en.tar.bz2 tar xvf sherpa-onnx-whisper-tiny.en.tar.bz2 diff --git a/nodejs-addon-examples/README.md b/nodejs-addon-examples/README.md index ef0c25395..8851c6265 100644 --- a/nodejs-addon-examples/README.md +++ b/nodejs-addon-examples/README.md @@ -43,6 +43,12 @@ export LD_LIBRARY_PATH=$PWD/node_modules/.pnpm/sherpa-onnx-node@ +#include + +#include "macros.h" // NOLINT +#include "napi.h" // NOLINT +#include "sherpa-onnx/c-api/c-api.h" + +static SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig +GetOfflineSpeakerSegmentationPyannoteModelConfig(Napi::Object obj) { + SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig c; + memset(&c, 0, sizeof(c)); + + if (!obj.Has("pyannote") || !obj.Get("pyannote").IsObject()) { + return c; + } + + Napi::Object o = obj.Get("pyannote").As(); + SHERPA_ONNX_ASSIGN_ATTR_STR(model, model); + + return c; +} + +static SherpaOnnxOfflineSpeakerSegmentationModelConfig +GetOfflineSpeakerSegmentationModelConfig(Napi::Object obj) { + SherpaOnnxOfflineSpeakerSegmentationModelConfig c; + memset(&c, 0, sizeof(c)); + + if (!obj.Has("segmentation") || !obj.Get("segmentation").IsObject()) { + return c; + } + + Napi::Object o = obj.Get("segmentation").As(); + + c.pyannote = GetOfflineSpeakerSegmentationPyannoteModelConfig(o); + + SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads); + + if (o.Has("debug") && + (o.Get("debug").IsNumber() || o.Get("debug").IsBoolean())) { + if (o.Get("debug").IsBoolean()) { + c.debug = o.Get("debug").As().Value(); + } else { + c.debug = o.Get("debug").As().Int32Value(); + } + } + + SHERPA_ONNX_ASSIGN_ATTR_STR(provider, provider); + + return c; +} + +static SherpaOnnxSpeakerEmbeddingExtractorConfig +GetSpeakerEmbeddingExtractorConfig(Napi::Object obj) { + SherpaOnnxSpeakerEmbeddingExtractorConfig c; + memset(&c, 0, sizeof(c)); + + if (!obj.Has("embedding") || !obj.Get("embedding").IsObject()) { + return c; + } + + Napi::Object o = obj.Get("embedding").As(); + + SHERPA_ONNX_ASSIGN_ATTR_STR(model, model); + SHERPA_ONNX_ASSIGN_ATTR_INT32(num_threads, numThreads); + + if (o.Has("debug") && + (o.Get("debug").IsNumber() || o.Get("debug").IsBoolean())) { + if (o.Get("debug").IsBoolean()) { + c.debug = o.Get("debug").As().Value(); + } else { + c.debug = o.Get("debug").As().Int32Value(); + } + } + + SHERPA_ONNX_ASSIGN_ATTR_STR(provider, provider); + + return c; +} + +static SherpaOnnxFastClusteringConfig GetFastClusteringConfig( + Napi::Object obj) { + SherpaOnnxFastClusteringConfig c; + memset(&c, 0, sizeof(c)); + + if (!obj.Has("clustering") || !obj.Get("clustering").IsObject()) { + return c; + } + + Napi::Object o = obj.Get("clustering").As(); + + SHERPA_ONNX_ASSIGN_ATTR_INT32(num_clusters, numClusters); + SHERPA_ONNX_ASSIGN_ATTR_FLOAT(threshold, threshold); + + return c; +} + +static Napi::External +CreateOfflineSpeakerDiarizationWrapper(const Napi::CallbackInfo &info) { + Napi::Env env = info.Env(); + if (info.Length() != 1) { + std::ostringstream os; + os << "Expect only 1 argument. Given: " << info.Length(); + + Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException(); + + return {}; + } + + if (!info[0].IsObject()) { + Napi::TypeError::New(env, "Expect an object as the argument") + .ThrowAsJavaScriptException(); + + return {}; + } + + Napi::Object o = info[0].As(); + + SherpaOnnxOfflineSpeakerDiarizationConfig c; + memset(&c, 0, sizeof(c)); + + c.segmentation = GetOfflineSpeakerSegmentationModelConfig(o); + c.embedding = GetSpeakerEmbeddingExtractorConfig(o); + c.clustering = GetFastClusteringConfig(o); + + SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_duration_on, minDurationOn); + SHERPA_ONNX_ASSIGN_ATTR_FLOAT(min_duration_off, minDurationOff); + + const SherpaOnnxOfflineSpeakerDiarization *sd = + SherpaOnnxCreateOfflineSpeakerDiarization(&c); + + if (c.segmentation.pyannote.model) { + delete[] c.segmentation.pyannote.model; + } + + if (c.segmentation.provider) { + delete[] c.segmentation.provider; + } + + if (c.embedding.model) { + delete[] c.embedding.model; + } + + if (c.embedding.provider) { + delete[] c.embedding.provider; + } + + if (!sd) { + Napi::TypeError::New(env, "Please check your config!") + .ThrowAsJavaScriptException(); + + return {}; + } + + return Napi::External::New( + env, const_cast(sd), + [](Napi::Env env, SherpaOnnxOfflineSpeakerDiarization *sd) { + SherpaOnnxDestroyOfflineSpeakerDiarization(sd); + }); +} + +static Napi::Number OfflineSpeakerDiarizationGetSampleRateWrapper( + const Napi::CallbackInfo &info) { + Napi::Env env = info.Env(); + + if (info.Length() != 1) { + std::ostringstream os; + os << "Expect only 1 argument. Given: " << info.Length(); + + Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException(); + + return {}; + } + + if (!info[0].IsExternal()) { + Napi::TypeError::New( + env, "Argument 0 should be an offline speaker diarization pointer.") + .ThrowAsJavaScriptException(); + + return {}; + } + + const SherpaOnnxOfflineSpeakerDiarization *sd = + info[0].As>().Data(); + + int32_t sample_rate = SherpaOnnxOfflineSpeakerDiarizationGetSampleRate(sd); + + return Napi::Number::New(env, sample_rate); +} + +static Napi::Array OfflineSpeakerDiarizationProcessWrapper( + const Napi::CallbackInfo &info) { + Napi::Env env = info.Env(); + + if (info.Length() != 2) { + std::ostringstream os; + os << "Expect only 2 arguments. Given: " << info.Length(); + + Napi::TypeError::New(env, os.str()).ThrowAsJavaScriptException(); + + return {}; + } + + if (!info[0].IsExternal()) { + Napi::TypeError::New( + env, "Argument 0 should be an offline speaker diarization pointer.") + .ThrowAsJavaScriptException(); + + return {}; + } + + const SherpaOnnxOfflineSpeakerDiarization *sd = + info[0].As>().Data(); + + if (!info[1].IsTypedArray()) { + Napi::TypeError::New(env, "Argument 1 should be a typed array") + .ThrowAsJavaScriptException(); + + return {}; + } + + Napi::Float32Array samples = info[1].As(); + + const SherpaOnnxOfflineSpeakerDiarizationResult *r = + SherpaOnnxOfflineSpeakerDiarizationProcess(sd, samples.Data(), + samples.ElementLength()); + + int32_t num_segments = + SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments(r); + + const SherpaOnnxOfflineSpeakerDiarizationSegment *segments = + SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime(r); + + Napi::Array ans = Napi::Array::New(env, num_segments); + + for (int32_t i = 0; i != num_segments; ++i) { + Napi::Object obj = Napi::Object::New(env); + obj.Set(Napi::String::New(env, "start"), segments[i].start); + obj.Set(Napi::String::New(env, "end"), segments[i].end); + obj.Set(Napi::String::New(env, "speaker"), segments[i].speaker); + + ans[i] = obj; + } + + SherpaOnnxOfflineSpeakerDiarizationDestroySegment(segments); + SherpaOnnxOfflineSpeakerDiarizationDestroyResult(r); + + return ans; +} + +void InitNonStreamingSpeakerDiarization(Napi::Env env, Napi::Object exports) { + exports.Set(Napi::String::New(env, "createOfflineSpeakerDiarization"), + Napi::Function::New(env, CreateOfflineSpeakerDiarizationWrapper)); + + exports.Set( + Napi::String::New(env, "getOfflineSpeakerDiarizationSampleRate"), + Napi::Function::New(env, OfflineSpeakerDiarizationGetSampleRateWrapper)); + + exports.Set( + Napi::String::New(env, "offlineSpeakerDiarizationProcess"), + Napi::Function::New(env, OfflineSpeakerDiarizationProcessWrapper)); +} diff --git a/scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc b/scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc index b86883d86..3f0affd79 100644 --- a/scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc +++ b/scripts/node-addon-api/src/sherpa-onnx-node-addon-api.cc @@ -25,6 +25,8 @@ void InitPunctuation(Napi::Env env, Napi::Object exports); void InitKeywordSpotting(Napi::Env env, Napi::Object exports); +void InitNonStreamingSpeakerDiarization(Napi::Env env, Napi::Object exports); + Napi::Object Init(Napi::Env env, Napi::Object exports) { InitStreamingAsr(env, exports); InitNonStreamingAsr(env, exports); @@ -37,6 +39,7 @@ Napi::Object Init(Napi::Env env, Napi::Object exports) { InitAudioTagging(env, exports); InitPunctuation(env, exports); InitKeywordSpotting(env, exports); + InitNonStreamingSpeakerDiarization(env, exports); return exports; }