diff --git a/.github/scripts/test-dart.sh b/.github/scripts/test-dart.sh index 0aff2085e..27c21573a 100755 --- a/.github/scripts/test-dart.sh +++ b/.github/scripts/test-dart.sh @@ -4,6 +4,11 @@ set -ex cd dart-api-examples +pushd speaker-diarization +echo '----------speaker diarization----------' +./run.sh +popd + pushd speaker-identification echo '----------3d speaker----------' ./run-3d-speaker.sh diff --git a/.github/workflows/test-dart.yaml b/.github/workflows/test-dart.yaml index 58d505490..d9e27e86f 100644 --- a/.github/workflows/test-dart.yaml +++ b/.github/workflows/test-dart.yaml @@ -114,6 +114,7 @@ jobs: cp scripts/dart/audio-tagging-pubspec.yaml dart-api-examples/audio-tagging/pubspec.yaml cp scripts/dart/add-punctuations-pubspec.yaml dart-api-examples/add-punctuations/pubspec.yaml cp scripts/dart/speaker-id-pubspec.yaml dart-api-examples/speaker-identification/pubspec.yaml + cp scripts/dart/speaker-diarization-pubspec.yaml dart-api-examples/speaker-diarization/pubspec.yaml cp scripts/dart/sherpa-onnx-pubspec.yaml flutter/sherpa_onnx/pubspec.yaml diff --git a/dart-api-examples/speaker-diarization/README.md b/dart-api-examples/speaker-diarization/README.md index 3816eca3a..d4d8c4fd2 100644 --- a/dart-api-examples/speaker-diarization/README.md +++ b/dart-api-examples/speaker-diarization/README.md @@ -1,2 +1,7 @@ -A sample command-line application with an entrypoint in `bin/`, library code -in `lib/`, and example unit test in `test/`. +# Introduction + +This example shows how to use the Dart API from sherpa-onnx for speaker diarization. + +# Usage + +Please see [./run.sh](./run.sh) diff --git a/dart-api-examples/speaker-diarization/bin/init.dart b/dart-api-examples/speaker-diarization/bin/init.dart new file mode 120000 index 000000000..48508cfd3 --- /dev/null +++ b/dart-api-examples/speaker-diarization/bin/init.dart @@ -0,0 +1 @@ +../../vad/bin/init.dart \ No newline at end of file diff --git a/dart-api-examples/speaker-diarization/bin/speaker-diarization.dart b/dart-api-examples/speaker-diarization/bin/speaker-diarization.dart new file mode 100644 index 000000000..760adc868 --- /dev/null +++ b/dart-api-examples/speaker-diarization/bin/speaker-diarization.dart @@ -0,0 +1,100 @@ +// Copyright (c) 2024 Xiaomi Corporation +import 'dart:io'; +import 'dart:typed_data'; +import 'dart:ffi'; + +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + /* Please use the following commands to download files used in this file + Step 1: Download a speaker segmentation model + + Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models + for a list of available models. The following is an example + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + + Step 2: Download a speaker embedding extractor model + + Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models + for a list of available models. The following is an example + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx + + Step 3. Download test wave files + + Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models + for a list of available test wave files. The following is an example + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav + + Step 4. Run it + */ + + final segmentationModel = + "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx"; + + final embeddingModel = + "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx"; + + final waveFilename = "./0-four-speakers-zh.wav"; + + final segmentationConfig = sherpa_onnx.OfflineSpeakerSegmentationModelConfig( + pyannote: sherpa_onnx.OfflineSpeakerSegmentationPyannoteModelConfig( + model: segmentationModel), + ); + + final embeddingConfig = + sherpa_onnx.SpeakerEmbeddingExtractorConfig(model: embeddingModel); + + // since we know there are 4 speakers in ./0-four-speakers-zh.wav, we set + // numClusters to 4. If you don't know the exact number, please set it to -1. + // in that case, you have to set threshold. A larger threshold leads to + // fewer clusters, i.e., fewer speakers. + final clusteringConfig = + sherpa_onnx.FastClusteringConfig(numClusters: 4, threshold: 0.5); + + var config = sherpa_onnx.OfflineSpeakerDiarizationConfig( + segmentation: segmentationConfig, + embedding: embeddingConfig, + clustering: clusteringConfig, + minDurationOn: 0.2, + minDurationOff: 0.5); + + final sd = sherpa_onnx.OfflineSpeakerDiarization(config); + if (sd.ptr == nullptr) { + return; + } + + final waveData = sherpa_onnx.readWave(waveFilename); + if (sd.sampleRate != waveData.sampleRate) { + print( + 'Expected sample rate: ${sd.sampleRate}, given: ${waveData.sampleRate}'); + return; + } + + print('started'); + + // Use the following statement if you don't want to use a callback + // final segments = sd.process(samples: waveData.samples); + + final segments = sd.processWithCallback( + samples: waveData.samples, + callback: (int numProcessedChunk, int numTotalChunks) { + final progress = 100.0 * numProcessedChunk / numTotalChunks; + + print('Progress ${progress.toStringAsFixed(2)}%'); + + return 0; + }); + + for (int i = 0; i < segments.length; ++i) { + print( + '${segments[i].start.toStringAsFixed(3)} -- ${segments[i].end.toStringAsFixed(3)} speaker_${segments[i].speaker}'); + } +} diff --git a/dart-api-examples/speaker-diarization/bin/speaker_diarization.dart b/dart-api-examples/speaker-diarization/bin/speaker_diarization.dart deleted file mode 100644 index 82a77f368..000000000 --- a/dart-api-examples/speaker-diarization/bin/speaker_diarization.dart +++ /dev/null @@ -1,5 +0,0 @@ -import 'package:speaker_diarization/speaker_diarization.dart' as speaker_diarization; - -void main(List arguments) { - print('Hello world: ${speaker_diarization.calculate()}!'); -} diff --git a/dart-api-examples/speaker-diarization/pubspec.yaml b/dart-api-examples/speaker-diarization/pubspec.yaml index a0d93941a..28154a49c 100644 --- a/dart-api-examples/speaker-diarization/pubspec.yaml +++ b/dart-api-examples/speaker-diarization/pubspec.yaml @@ -1,15 +1,17 @@ name: speaker_diarization -description: A sample command-line application. +description: > + This example demonstrates how to use the Dart API for speaker diarization. + version: 1.0.0 -# repository: https://github.com/my_org/my_repo environment: - sdk: ^3.4.0 + sdk: ">=3.0.0 <4.0.0" -# Add regular dependencies here. dependencies: - # path: ^1.8.0 + sherpa_onnx: ^1.10.27 + # sherpa_onnx: + # path: ../../flutter/sherpa_onnx + path: ^1.9.0 dev_dependencies: lints: ^3.0.0 - test: ^1.24.0 diff --git a/dart-api-examples/speaker-diarization/run.sh b/dart-api-examples/speaker-diarization/run.sh new file mode 100755 index 000000000..7717870dc --- /dev/null +++ b/dart-api-examples/speaker-diarization/run.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 +fi + +if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx +fi + +if [ ! -f ./0-four-speakers-zh.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav +fi + +dart run ./bin/speaker-diarization.dart diff --git a/flutter/sherpa_onnx/example/example.md b/flutter/sherpa_onnx/example/example.md index 7e7e8031d..0c24a79b2 100644 --- a/flutter/sherpa_onnx/example/example.md +++ b/flutter/sherpa_onnx/example/example.md @@ -11,6 +11,7 @@ | Functions | URL | Supported Platforms| |---|---|---| +|Speaker diarization| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/speaker-diarization)| macOS, Windows, Linux| |Streaming speech recognition| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/streaming-asr)| macOS, Windows, Linux| |Non-Streaming speech recognition| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/non-streaming-asr)| macOS, Windows, Linux| |Text to speech| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/tts)| macOS, Windows, Linux| diff --git a/flutter/sherpa_onnx/lib/sherpa_onnx.dart b/flutter/sherpa_onnx/lib/sherpa_onnx.dart index b15e67532..9fcd2872f 100644 --- a/flutter/sherpa_onnx/lib/sherpa_onnx.dart +++ b/flutter/sherpa_onnx/lib/sherpa_onnx.dart @@ -6,6 +6,7 @@ export 'src/audio_tagging.dart'; export 'src/feature_config.dart'; export 'src/keyword_spotter.dart'; export 'src/offline_recognizer.dart'; +export 'src/offline_speaker_diarization.dart'; export 'src/offline_stream.dart'; export 'src/online_recognizer.dart'; export 'src/online_stream.dart'; diff --git a/flutter/sherpa_onnx/lib/src/offline_speaker_diarization.dart b/flutter/sherpa_onnx/lib/src/offline_speaker_diarization.dart index 7c3b42239..ff9d34070 100644 --- a/flutter/sherpa_onnx/lib/src/offline_speaker_diarization.dart +++ b/flutter/sherpa_onnx/lib/src/offline_speaker_diarization.dart @@ -1,6 +1,7 @@ // Copyright (c) 2024 Xiaomi Corporation import 'dart:convert'; import 'dart:ffi'; +import 'dart:typed_data'; import 'package:ffi/ffi.dart'; @@ -9,7 +10,7 @@ import './speaker_identification.dart'; import './utils.dart'; class OfflineSpeakerDiarizationSegment { - OfflineSpeakerSegmentationSegment({ + const OfflineSpeakerDiarizationSegment({ required this.start, required this.end, required this.speaker, @@ -76,7 +77,7 @@ class FastClusteringConfig { class OfflineSpeakerDiarizationConfig { const OfflineSpeakerDiarizationConfig({ this.segmentation = const OfflineSpeakerSegmentationModelConfig(), - this.embedding = const SpeakerEmbeddingExtractorConfig(), + this.embedding = const SpeakerEmbeddingExtractorConfig(model: ''), this.clustering = const FastClusteringConfig(), this.minDurationOn = 0.2, this.minDurationOff = 0.5, @@ -95,7 +96,8 @@ class OfflineSpeakerDiarizationConfig { } class OfflineSpeakerDiarization { - OfflineSpeakerDiarization._({required this.ptr, required this.config}); + OfflineSpeakerDiarization._( + {required this.ptr, required this.config, required this.sampleRate}); void free() { SherpaOnnxBindings.sherpaOnnxDestroyOfflineSpeakerDiarization?.call(ptr); @@ -125,7 +127,7 @@ class OfflineSpeakerDiarization { c.ref.minDurationOff = config.minDurationOff; final ptr = - SherpaOnnxBindings.SherpaOnnxCreateOfflineSpeakerDiarization?.call(c) ?? + SherpaOnnxBindings.sherpaOnnxCreateOfflineSpeakerDiarization?.call(c) ?? nullptr; calloc.free(c.ref.embedding.provider); @@ -133,17 +135,21 @@ class OfflineSpeakerDiarization { calloc.free(c.ref.segmentation.provider); calloc.free(c.ref.segmentation.pyannote.model); - final sampleRate = SherpaOnnxBindings - .SherpaOnnxOfflineSpeakerDiarizationGetSampleRate?.call(ptr) ?? - 0; + int sampleRate = 0; + if (ptr != nullptr) { + sampleRate = SherpaOnnxBindings + .sherpaOnnxOfflineSpeakerDiarizationGetSampleRate + ?.call(ptr) ?? + 0; + } return OfflineSpeakerDiarization._( ptr: ptr, config: config, sampleRate: sampleRate); } List process( - {required samples: Float32List}) { + {required Float32List samples}) { if (ptr == nullptr) { - return []; + return []; } final n = samples.length; @@ -156,8 +162,53 @@ class OfflineSpeakerDiarization { ?.call(ptr, p, n) ?? nullptr; + final ans = _processImpl(r); + + SherpaOnnxBindings.sherpaOnnxOfflineSpeakerDiarizationDestroyResult + ?.call(r); + + return ans; + } + + List processWithCallback({ + required Float32List samples, + required int Function(int numProcessedChunks, int numTotalChunks) callback, + }) { + if (ptr == nullptr) { + return []; + } + + final n = samples.length; + final Pointer p = calloc(n); + + final pList = p.asTypedList(n); + pList.setAll(0, samples); + + final wrapper = NativeCallable< + SherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArgNative>.isolateLocal( + (int numProcessedChunks, int numTotalChunks) { + return callback(numProcessedChunks, numTotalChunks); + }, exceptionalReturn: 0); + + final r = SherpaOnnxBindings + .sherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg + ?.call(ptr, p, n, wrapper.nativeFunction) ?? + nullptr; + + wrapper.close(); + + final ans = _processImpl(r); + + SherpaOnnxBindings.sherpaOnnxOfflineSpeakerDiarizationDestroyResult + ?.call(r); + + return ans; + } + + List _processImpl( + Pointer r) { if (r == nullptr) { - return []; + return []; } final numSegments = SherpaOnnxBindings @@ -170,20 +221,20 @@ class OfflineSpeakerDiarization { nullptr; if (segments == nullptr) { - return []; + return []; } - final ans = []; + final ans = []; for (int i = 0; i != numSegments; ++i) { final s = segments.elementAt(i); - final tmp = OfflineSpeakerSegmentationSegment( + final tmp = OfflineSpeakerDiarizationSegment( start: s.ref.start, end: s.ref.end, speaker: s.ref.speaker); ans.add(tmp); } - SherpaOnnxBindings.sherpaOnnxOfflineSpeakerDiarizationDestroyResult - ?.call(r); + SherpaOnnxBindings.sherpaOnnxOfflineSpeakerDiarizationDestroySegment + ?.call(segments); return ans; } diff --git a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart index 15d5e7968..8a8817d63 100644 --- a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart +++ b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart @@ -514,6 +514,27 @@ typedef SherpaOnnxOfflineSpeakerDiarizationProcess = Pointer Function( Pointer, Pointer, int); +typedef SherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArgNative = Int32 + Function(Int32, Int32); + +typedef SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArgNative + = Pointer Function( + Pointer, + Pointer, + Int32, + Pointer< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArgNative>>); + +typedef SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg + = Pointer Function( + Pointer, + Pointer, + int, + Pointer< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArgNative>>); + typedef SherpaOnnxOfflineSpeakerDiarizationDestroyResultNative = Void Function( Pointer); @@ -1078,6 +1099,8 @@ class SherpaOnnxBindings { sherpaOnnxOfflineSpeakerDiarizationProcess; static SherpaOnnxOfflineSpeakerDiarizationDestroyResult? sherpaOnnxOfflineSpeakerDiarizationDestroyResult; + static SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg? + sherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg; static SherpaOnnxCreateOfflinePunctuation? sherpaOnnxCreateOfflinePunctuation; static SherpaOnnxDestroyOfflinePunctuation? @@ -1249,7 +1272,7 @@ class SherpaOnnxBindings { sherpaOnnxCreateOfflineSpeakerDiarization ??= dynamicLibrary .lookup< NativeFunction< - SherpaOnnxCreateOfflineSpeakerDiarizatioNativen>>( + SherpaOnnxCreateOfflineSpeakerDiarizationNative>>( 'SherpaOnnxCreateOfflineSpeakerDiarization') .asFunction(); @@ -1309,6 +1332,13 @@ class SherpaOnnxBindings { 'SherpaOnnxOfflineSpeakerDiarizationProcess') .asFunction(); + sherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg ??= dynamicLibrary + .lookup< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArgNative>>( + 'SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg') + .asFunction(); + sherpaOnnxOfflineSpeakerDiarizationDestroyResult ??= dynamicLibrary .lookup< NativeFunction< diff --git a/flutter/sherpa_onnx/pubspec.yaml b/flutter/sherpa_onnx/pubspec.yaml index 5b693ef0b..e92071833 100644 --- a/flutter/sherpa_onnx/pubspec.yaml +++ b/flutter/sherpa_onnx/pubspec.yaml @@ -1,8 +1,8 @@ name: sherpa_onnx description: > - Speech recognition, speech synthesis, and speaker recognition using next-gen Kaldi - with onnxruntime without Internet connection. + Speech recognition, speech synthesis, speaker diarization, and speaker recognition + using next-gen Kaldi with onnxruntime without Internet connection. repository: https://github.com/k2-fsa/sherpa-onnx/tree/master/flutter @@ -12,7 +12,7 @@ documentation: https://k2-fsa.github.io/sherpa/onnx/ topics: - speech-recognition - speech-synthesis - - speaker-identification + - speaker-diarization - audio-tagging - voice-activity-detection @@ -41,7 +41,7 @@ dependencies: sherpa_onnx_linux: ^1.10.27 # sherpa_onnx_linux: # path: ../sherpa_onnx_linux - # + sherpa_onnx_windows: ^1.10.27 # sherpa_onnx_windows: # path: ../sherpa_onnx_windows diff --git a/scripts/dart/speaker-diarization-pubspec.yaml b/scripts/dart/speaker-diarization-pubspec.yaml new file mode 100644 index 000000000..fec147e75 --- /dev/null +++ b/scripts/dart/speaker-diarization-pubspec.yaml @@ -0,0 +1,16 @@ +name: speaker_diarization +description: > + This example demonstrates how to use the Dart API for speaker diarization. + +version: 1.0.0 + +environment: + sdk: ">=3.0.0 <4.0.0" + +dependencies: + sherpa_onnx: + path: ../../flutter/sherpa_onnx + path: ^1.9.0 + +dev_dependencies: + lints: ^3.0.0 diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index abcfc5b82..4ba0a4a60 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -1828,4 +1828,20 @@ SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback( return ans; } +const SherpaOnnxOfflineSpeakerDiarizationResult * +SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg( + const SherpaOnnxOfflineSpeakerDiarization *sd, const float *samples, + int32_t n, + SherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg callback) { + auto wrapper = [callback](int32_t num_processed_chunks, + int32_t num_total_chunks, void *) { + return callback(num_processed_chunks, num_total_chunks); + }; + + auto ans = new SherpaOnnxOfflineSpeakerDiarizationResult; + ans->impl = sd->impl->Process(samples, n, wrapper); + + return ans; +} + #endif diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index c9e7f9ee1..4b41a81a9 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -1485,6 +1485,9 @@ SHERPA_ONNX_API void SherpaOnnxOfflineSpeakerDiarizationDestroySegment( typedef int32_t (*SherpaOnnxOfflineSpeakerDiarizationProgressCallback)( int32_t num_processed_chunk, int32_t num_total_chunks, void *arg); +typedef int32_t (*SherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg)( + int32_t num_processed_chunk, int32_t num_total_chunks); + // The user has to invoke SherpaOnnxOfflineSpeakerDiarizationDestroyResult() // to free the returned pointer to avoid memory leak. SHERPA_ONNX_API const SherpaOnnxOfflineSpeakerDiarizationResult * @@ -1500,6 +1503,12 @@ SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback( int32_t n, SherpaOnnxOfflineSpeakerDiarizationProgressCallback callback, void *arg); +SHERPA_ONNX_API const SherpaOnnxOfflineSpeakerDiarizationResult * +SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg( + const SherpaOnnxOfflineSpeakerDiarization *sd, const float *samples, + int32_t n, + SherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg callback); + SHERPA_ONNX_API void SherpaOnnxOfflineSpeakerDiarizationDestroyResult( const SherpaOnnxOfflineSpeakerDiarizationResult *r);