diff --git a/.github/scripts/test-dart.sh b/.github/scripts/test-dart.sh index 0aff2085e..27c21573a 100755 --- a/.github/scripts/test-dart.sh +++ b/.github/scripts/test-dart.sh @@ -4,6 +4,11 @@ set -ex cd dart-api-examples +pushd speaker-diarization +echo '----------speaker diarization----------' +./run.sh +popd + pushd speaker-identification echo '----------3d speaker----------' ./run-3d-speaker.sh diff --git a/.github/workflows/test-dart.yaml b/.github/workflows/test-dart.yaml index 58d505490..d9e27e86f 100644 --- a/.github/workflows/test-dart.yaml +++ b/.github/workflows/test-dart.yaml @@ -114,6 +114,7 @@ jobs: cp scripts/dart/audio-tagging-pubspec.yaml dart-api-examples/audio-tagging/pubspec.yaml cp scripts/dart/add-punctuations-pubspec.yaml dart-api-examples/add-punctuations/pubspec.yaml cp scripts/dart/speaker-id-pubspec.yaml dart-api-examples/speaker-identification/pubspec.yaml + cp scripts/dart/speaker-diarization-pubspec.yaml dart-api-examples/speaker-diarization/pubspec.yaml cp scripts/dart/sherpa-onnx-pubspec.yaml flutter/sherpa_onnx/pubspec.yaml diff --git a/dart-api-examples/README.md b/dart-api-examples/README.md index 9370372e7..3d66cb04e 100644 --- a/dart-api-examples/README.md +++ b/dart-api-examples/README.md @@ -9,6 +9,7 @@ https://pub.dev/packages/sherpa_onnx | Directory | Description | |-----------|-------------| +| [./speaker-diarization](./speaker-diarization)| Example for speaker diarization.| | [./add-punctuations](./add-punctuations)| Example for adding punctuations to text.| | [./audio-tagging](./audio-tagging)| Example for audio tagging.| | [./keyword-spotter](./keyword-spotter)| Example for keyword spotting| diff --git a/dart-api-examples/speaker-diarization/.gitignore b/dart-api-examples/speaker-diarization/.gitignore new file mode 100644 index 000000000..3a8579040 --- /dev/null +++ b/dart-api-examples/speaker-diarization/.gitignore @@ -0,0 +1,3 @@ +# https://dart.dev/guides/libraries/private-files +# Created by `dart pub` +.dart_tool/ 
diff --git a/dart-api-examples/speaker-diarization/CHANGELOG.md b/dart-api-examples/speaker-diarization/CHANGELOG.md new file mode 100644 index 000000000..effe43c82 --- /dev/null +++ b/dart-api-examples/speaker-diarization/CHANGELOG.md @@ -0,0 +1,3 @@ +## 1.0.0 + +- Initial version. diff --git a/dart-api-examples/speaker-diarization/README.md b/dart-api-examples/speaker-diarization/README.md new file mode 100644 index 000000000..d4d8c4fd2 --- /dev/null +++ b/dart-api-examples/speaker-diarization/README.md @@ -0,0 +1,7 @@ +# Introduction + +This example shows how to use the Dart API from sherpa-onnx for speaker diarization. + +# Usage + +Please see [./run.sh](./run.sh) diff --git a/dart-api-examples/speaker-diarization/analysis_options.yaml b/dart-api-examples/speaker-diarization/analysis_options.yaml new file mode 100644 index 000000000..dee8927aa --- /dev/null +++ b/dart-api-examples/speaker-diarization/analysis_options.yaml @@ -0,0 +1,30 @@ +# This file configures the static analysis results for your project (errors, +# warnings, and lints). +# +# This enables the 'recommended' set of lints from `package:lints`. +# This set helps identify many issues that may lead to problems when running +# or consuming Dart code, and enforces writing Dart using a single, idiomatic +# style and format. +# +# If you want a smaller set of lints you can change this to specify +# 'package:lints/core.yaml'. These are just the most critical lints +# (the recommended set includes the core lints). +# The core lints are also what is used by pub.dev for scoring packages. + +include: package:lints/recommended.yaml + +# Uncomment the following section to specify additional rules. 
+ +# linter: +# rules: +# - camel_case_types + +# analyzer: +# exclude: +# - path/to/excluded/files/** + +# For more information about the core and recommended set of lints, see +# https://dart.dev/go/core-lints + +# For additional information about configuring this file, see +# https://dart.dev/guides/language/analysis-options diff --git a/dart-api-examples/speaker-diarization/bin/init.dart b/dart-api-examples/speaker-diarization/bin/init.dart new file mode 120000 index 000000000..48508cfd3 --- /dev/null +++ b/dart-api-examples/speaker-diarization/bin/init.dart @@ -0,0 +1 @@ +../../vad/bin/init.dart \ No newline at end of file diff --git a/dart-api-examples/speaker-diarization/bin/speaker-diarization.dart b/dart-api-examples/speaker-diarization/bin/speaker-diarization.dart new file mode 100644 index 000000000..760adc868 --- /dev/null +++ b/dart-api-examples/speaker-diarization/bin/speaker-diarization.dart @@ -0,0 +1,100 @@ +// Copyright (c) 2024 Xiaomi Corporation +import 'dart:io'; +import 'dart:typed_data'; +import 'dart:ffi'; + +import 'package:sherpa_onnx/sherpa_onnx.dart' as sherpa_onnx; +import './init.dart'; + +void main(List arguments) async { + await initSherpaOnnx(); + + /* Please use the following commands to download files used in this file + Step 1: Download a speaker segmentation model + + Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models + for a list of available models. The following is an example + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + + Step 2: Download a speaker embedding extractor model + + Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models + for a list of available models. 
The following is an example + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx + + Step 3. Download test wave files + + Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models + for a list of available test wave files. The following is an example + + wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav + + Step 4. Run it + */ + + final segmentationModel = + "./sherpa-onnx-pyannote-segmentation-3-0/model.onnx"; + + final embeddingModel = + "./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx"; + + final waveFilename = "./0-four-speakers-zh.wav"; + + final segmentationConfig = sherpa_onnx.OfflineSpeakerSegmentationModelConfig( + pyannote: sherpa_onnx.OfflineSpeakerSegmentationPyannoteModelConfig( + model: segmentationModel), + ); + + final embeddingConfig = + sherpa_onnx.SpeakerEmbeddingExtractorConfig(model: embeddingModel); + + // since we know there are 4 speakers in ./0-four-speakers-zh.wav, we set + // numClusters to 4. If you don't know the exact number, please set it to -1. + // in that case, you have to set threshold. A larger threshold leads to + // fewer clusters, i.e., fewer speakers. 
+ final clusteringConfig = + sherpa_onnx.FastClusteringConfig(numClusters: 4, threshold: 0.5); + + // Combine the segmentation, embedding and clustering settings into the + // diarization config. + var config = sherpa_onnx.OfflineSpeakerDiarizationConfig( + segmentation: segmentationConfig, + embedding: embeddingConfig, + clustering: clusteringConfig, + minDurationOn: 0.2, + minDurationOff: 0.5); + + final sd = sherpa_onnx.OfflineSpeakerDiarization(config); + if (sd.ptr == nullptr) { + // The native object was not created, so there is nothing to free. + return; + } + + final waveData = sherpa_onnx.readWave(waveFilename); + if (sd.sampleRate != waveData.sampleRate) { + print( + 'Expected sample rate: ${sd.sampleRate}, given: ${waveData.sampleRate}'); + // Release the native object before bailing out. + sd.free(); + return; + } + + print('started'); + + // Use the following statement if you don't want to use a callback + // final segments = sd.process(samples: waveData.samples); + + final segments = sd.processWithCallback( + samples: waveData.samples, + callback: (int numProcessedChunk, int numTotalChunks) { + final progress = 100.0 * numProcessedChunk / numTotalChunks; + + print('Progress ${progress.toStringAsFixed(2)}%'); + + return 0; + }); + + for (int i = 0; i < segments.length; ++i) { + print( + '${segments[i].start.toStringAsFixed(3)} -- ${segments[i].end.toStringAsFixed(3)} speaker_${segments[i].speaker}'); + } + + // The segments are plain Dart objects by now; release the native + // diarization object as OfflineSpeakerDiarization.free() requires. + sd.free(); +} diff --git a/dart-api-examples/speaker-diarization/pubspec.yaml b/dart-api-examples/speaker-diarization/pubspec.yaml new file mode 100644 index 000000000..28154a49c --- /dev/null +++ b/dart-api-examples/speaker-diarization/pubspec.yaml @@ -0,0 +1,17 @@ +name: speaker_diarization +description: > + This example demonstrates how to use the Dart API for speaker diarization. 
+ +version: 1.0.0 + +environment: + sdk: ">=3.0.0 <4.0.0" + +dependencies: + sherpa_onnx: ^1.10.27 + # sherpa_onnx: + # path: ../../flutter/sherpa_onnx + path: ^1.9.0 + +dev_dependencies: + lints: ^3.0.0 diff --git a/dart-api-examples/speaker-diarization/run.sh b/dart-api-examples/speaker-diarization/run.sh new file mode 100755 index 000000000..7717870dc --- /dev/null +++ b/dart-api-examples/speaker-diarization/run.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +set -ex + +dart pub get + +if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 + rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2 +fi + +if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx +fi + +if [ ! 
-f ./0-four-speakers-zh.wav ]; then + curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav +fi + +dart run ./bin/speaker-diarization.dart diff --git a/flutter/sherpa_onnx/example/example.md b/flutter/sherpa_onnx/example/example.md index 7e7e8031d..0c24a79b2 100644 --- a/flutter/sherpa_onnx/example/example.md +++ b/flutter/sherpa_onnx/example/example.md @@ -11,6 +11,7 @@ | Functions | URL | Supported Platforms| |---|---|---| +|Speaker diarization| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/speaker-diarization)| macOS, Windows, Linux| |Streaming speech recognition| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/streaming-asr)| macOS, Windows, Linux| |Non-Streaming speech recognition| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/non-streaming-asr)| macOS, Windows, Linux| |Text to speech| [Address](https://github.com/k2-fsa/sherpa-onnx/tree/master/dart-api-examples/tts)| macOS, Windows, Linux| diff --git a/flutter/sherpa_onnx/lib/sherpa_onnx.dart b/flutter/sherpa_onnx/lib/sherpa_onnx.dart index b15e67532..9fcd2872f 100644 --- a/flutter/sherpa_onnx/lib/sherpa_onnx.dart +++ b/flutter/sherpa_onnx/lib/sherpa_onnx.dart @@ -6,6 +6,7 @@ export 'src/audio_tagging.dart'; export 'src/feature_config.dart'; export 'src/keyword_spotter.dart'; export 'src/offline_recognizer.dart'; +export 'src/offline_speaker_diarization.dart'; export 'src/offline_stream.dart'; export 'src/online_recognizer.dart'; export 'src/online_stream.dart'; diff --git a/flutter/sherpa_onnx/lib/src/offline_speaker_diarization.dart b/flutter/sherpa_onnx/lib/src/offline_speaker_diarization.dart new file mode 100644 index 000000000..5981e3c04 --- /dev/null +++ b/flutter/sherpa_onnx/lib/src/offline_speaker_diarization.dart @@ -0,0 +1,243 @@ +// Copyright (c) 2024 Xiaomi Corporation +import 'dart:ffi'; +import 'dart:typed_data'; + +import 
'package:ffi/ffi.dart'; + +import './sherpa_onnx_bindings.dart'; +import './speaker_identification.dart'; + +class OfflineSpeakerDiarizationSegment { + const OfflineSpeakerDiarizationSegment({ + required this.start, + required this.end, + required this.speaker, + }); + + @override + String toString() { + return 'OfflineSpeakerDiarizationSegment(start: $start, end: $end, speaker: $speaker)'; + } + + final double start; + final double end; + final int speaker; +} + +class OfflineSpeakerSegmentationPyannoteModelConfig { + const OfflineSpeakerSegmentationPyannoteModelConfig({ + this.model = '', + }); + + @override + String toString() { + return 'OfflineSpeakerSegmentationPyannoteModelConfig(model: $model)'; + } + + final String model; +} + +class OfflineSpeakerSegmentationModelConfig { + const OfflineSpeakerSegmentationModelConfig({ + this.pyannote = const OfflineSpeakerSegmentationPyannoteModelConfig(), + this.numThreads = 1, + this.debug = true, + this.provider = 'cpu', + }); + + @override + String toString() { + return 'OfflineSpeakerSegmentationModelConfig(pyannote: $pyannote, numThreads: $numThreads, debug: $debug, provider: $provider)'; + } + + final OfflineSpeakerSegmentationPyannoteModelConfig pyannote; + + final int numThreads; + final bool debug; + final String provider; +} + +class FastClusteringConfig { + const FastClusteringConfig({ + this.numClusters = -1, + this.threshold = 0.5, + }); + + @override + String toString() { + return 'FastClusteringConfig(numClusters: $numClusters, threshold: $threshold)'; + } + + final int numClusters; + final double threshold; +} + +class OfflineSpeakerDiarizationConfig { + const OfflineSpeakerDiarizationConfig({ + this.segmentation = const OfflineSpeakerSegmentationModelConfig(), + this.embedding = const SpeakerEmbeddingExtractorConfig(model: ''), + this.clustering = const FastClusteringConfig(), + this.minDurationOn = 0.2, + this.minDurationOff = 0.5, + }); + + @override + String toString() { + return 
'OfflineSpeakerDiarizationConfig(segmentation: $segmentation, embedding: $embedding, clustering: $clustering, minDurationOn: $minDurationOn, minDurationOff: $minDurationOff)'; + } + + final OfflineSpeakerSegmentationModelConfig segmentation; + final SpeakerEmbeddingExtractorConfig embedding; + final FastClusteringConfig clustering; + final double minDurationOff; // in seconds + final double minDurationOn; // in seconds +} + +class OfflineSpeakerDiarization { + OfflineSpeakerDiarization._( + {required this.ptr, required this.config, required this.sampleRate}); + + void free() { + SherpaOnnxBindings.sherpaOnnxDestroyOfflineSpeakerDiarization?.call(ptr); + ptr = nullptr; + } + + /// The user is responsible to call the OfflineSpeakerDiarization.free() + /// method of the returned instance to avoid memory leak. + factory OfflineSpeakerDiarization(OfflineSpeakerDiarizationConfig config) { + final c = calloc(); + + c.ref.segmentation.pyannote.model = + config.segmentation.pyannote.model.toNativeUtf8(); + c.ref.segmentation.numThreads = config.segmentation.numThreads; + c.ref.segmentation.debug = config.segmentation.debug ? 1 : 0; + c.ref.segmentation.provider = config.segmentation.provider.toNativeUtf8(); + + c.ref.embedding.model = config.embedding.model.toNativeUtf8(); + c.ref.embedding.numThreads = config.embedding.numThreads; + c.ref.embedding.debug = config.embedding.debug ? 1 : 0; + c.ref.embedding.provider = config.embedding.provider.toNativeUtf8(); + + c.ref.clustering.numClusters = config.clustering.numClusters; + c.ref.clustering.threshold = config.clustering.threshold; + + c.ref.minDurationOn = config.minDurationOn; + c.ref.minDurationOff = config.minDurationOff; + + final ptr = + SherpaOnnxBindings.sherpaOnnxCreateOfflineSpeakerDiarization?.call(c) ?? 
+ nullptr; + + calloc.free(c.ref.embedding.provider); + calloc.free(c.ref.embedding.model); + calloc.free(c.ref.segmentation.provider); + calloc.free(c.ref.segmentation.pyannote.model); + // The native strings are released; now release the config struct that + // held them, otherwise it leaks on every construction. + calloc.free(c); + + int sampleRate = 0; + if (ptr != nullptr) { + sampleRate = SherpaOnnxBindings + .sherpaOnnxOfflineSpeakerDiarizationGetSampleRate + ?.call(ptr) ?? + 0; + } + return OfflineSpeakerDiarization._( + ptr: ptr, config: config, sampleRate: sampleRate); + } + + /// Runs speaker diarization over [samples] and returns the resulting + /// segments. + /// + /// Returns an empty list when [ptr] is nullptr. + List process( + {required Float32List samples}) { + if (ptr == nullptr) { + return []; + } + + // Copy the samples into native memory so they can be passed over FFI. + final n = samples.length; + final Pointer p = calloc(n); + + final pList = p.asTypedList(n); + pList.setAll(0, samples); + + final r = SherpaOnnxBindings.sherpaOnnxOfflineSpeakerDiarizationProcess + ?.call(ptr, p, n) ?? + nullptr; + + final ans = _processImpl(r); + + SherpaOnnxBindings.sherpaOnnxOfflineSpeakerDiarizationDestroyResult + ?.call(r); + + // Everything native has been consumed; free the temporary sample buffer. + calloc.free(p); + + return ans; + } + + /// Same as [process], but the native code invokes [callback] with the + /// number of processed chunks and the total number of chunks. + /// + /// The value returned by [callback] is forwarded to the native code. + /// Returns an empty list when [ptr] is nullptr. + List processWithCallback({ + required Float32List samples, + required int Function(int numProcessedChunks, int numTotalChunks) callback, + }) { + if (ptr == nullptr) { + return []; + } + + // Copy the samples into native memory so they can be passed over FFI. + final n = samples.length; + final Pointer p = calloc(n); + + final pList = p.asTypedList(n); + pList.setAll(0, samples); + + // Expose the Dart closure as a native function pointer for the duration + // of the call; a thrown exception is reported to native code as 0. + final wrapper = NativeCallable< + SherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArgNative>.isolateLocal( + (int numProcessedChunks, int numTotalChunks) { + return callback(numProcessedChunks, numTotalChunks); + }, exceptionalReturn: 0); + + final r = SherpaOnnxBindings + .sherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg + ?.call(ptr, p, n, wrapper.nativeFunction) ?? + nullptr; + + wrapper.close(); + + final ans = _processImpl(r); + + SherpaOnnxBindings.sherpaOnnxOfflineSpeakerDiarizationDestroyResult + ?.call(r); + + // Everything native has been consumed; free the temporary sample buffer. + calloc.free(p); + + return ans; + } + + List _processImpl( + Pointer r) { + if (r == nullptr) { + return []; + } + + final numSegments = SherpaOnnxBindings + .sherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments + ?.call(r) ?? 
+ 0; + final segments = SherpaOnnxBindings + .sherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime + ?.call(r) ?? + nullptr; + + if (segments == nullptr) { + return []; + } + + final ans = []; + for (int i = 0; i != numSegments; ++i) { + final s = segments + i; + + final tmp = OfflineSpeakerDiarizationSegment( + start: s.ref.start, end: s.ref.end, speaker: s.ref.speaker); + ans.add(tmp); + } + + SherpaOnnxBindings.sherpaOnnxOfflineSpeakerDiarizationDestroySegment + ?.call(segments); + + return ans; + } + + Pointer ptr; + OfflineSpeakerDiarizationConfig config; + final int sampleRate; +} diff --git a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart index 42294c2d4..8a8817d63 100644 --- a/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart +++ b/flutter/sherpa_onnx/lib/src/sherpa_onnx_bindings.dart @@ -2,6 +2,66 @@ import 'dart:ffi'; import 'package:ffi/ffi.dart'; +final class SherpaOnnxSpeakerEmbeddingExtractorConfig extends Struct { + external Pointer model; + + @Int32() + external int numThreads; + + @Int32() + external int debug; + + external Pointer provider; +} + +final class SherpaOnnxOfflineSpeakerDiarizationSegment extends Struct { + @Float() + external double start; + + @Float() + external double end; + + @Int32() + external int speaker; +} + +final class SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig + extends Struct { + external Pointer model; +} + +final class SherpaOnnxOfflineSpeakerSegmentationModelConfig extends Struct { + external SherpaOnnxOfflineSpeakerSegmentationPyannoteModelConfig pyannote; + + @Int32() + external int numThreads; + + @Int32() + external int debug; + + external Pointer provider; +} + +final class SherpaOnnxFastClusteringConfig extends Struct { + @Int32() + external int numClusters; + + @Float() + external double threshold; +} + +final class SherpaOnnxOfflineSpeakerDiarizationConfig extends Struct { + external 
SherpaOnnxOfflineSpeakerSegmentationModelConfig segmentation; + external SherpaOnnxSpeakerEmbeddingExtractorConfig embedding; + external SherpaOnnxFastClusteringConfig clustering; + + @Float() + external double minDurationOn; + + @Float() + external double minDurationOff; +} + final class SherpaOnnxOfflinePunctuationModelConfig extends Struct { external Pointer ctTransformer; @@ -341,18 +401,6 @@ final class SherpaOnnxWave extends Struct { external int numSamples; } -final class SherpaOnnxSpeakerEmbeddingExtractorConfig extends Struct { - external Pointer model; - - @Int32() - external int numThreads; - - @Int32() - external int debug; - - external Pointer provider; -} - final class SherpaOnnxKeywordSpotterConfig extends Struct { external SherpaOnnxFeatureConfig feat; @@ -402,10 +450,101 @@ final class SherpaOnnxSpeakerEmbeddingExtractor extends Opaque {} final class SherpaOnnxSpeakerEmbeddingManager extends Opaque {} +final class SherpaOnnxOfflineSpeakerDiarization extends Opaque {} + +final class SherpaOnnxOfflineSpeakerDiarizationResult extends Opaque {} + +typedef SherpaOnnxCreateOfflineSpeakerDiarizationNative + = Pointer Function( + Pointer); + +typedef SherpaOnnxCreateOfflineSpeakerDiarization + = SherpaOnnxCreateOfflineSpeakerDiarizationNative; + +typedef SherpaOnnxDestroyOfflineSpeakerDiarizationNative = Void Function( + Pointer); + +typedef SherpaOnnxDestroyOfflineSpeakerDiarization = void Function( + Pointer); + typedef SherpaOnnxCreateOfflinePunctuationNative = Pointer Function( Pointer); +typedef SherpaOnnxOfflineSpeakerDiarizationGetSampleRateNative = Int32 Function( + Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationGetSampleRate = int Function( + Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationSetConfigNative = Void Function( + Pointer, + Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakersNative = Int32 + Function(Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakers = int Function( + 
Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegmentsNative = Int32 + Function(Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments = int Function( + Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTimeNative + = Pointer Function( + Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime + = SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTimeNative; + +typedef SherpaOnnxOfflineSpeakerDiarizationDestroySegmentNative = Void Function( + Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationDestroySegment = void Function( + Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationProcessNative + = Pointer Function( + Pointer, Pointer, Int32); + +typedef SherpaOnnxOfflineSpeakerDiarizationProcess + = Pointer Function( + Pointer, Pointer, int); + +typedef SherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArgNative = Int32 + Function(Int32, Int32); + +typedef SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArgNative + = Pointer Function( + Pointer, + Pointer, + Int32, + Pointer< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArgNative>>); + +typedef SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg + = Pointer Function( + Pointer, + Pointer, + int, + Pointer< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArgNative>>); + +typedef SherpaOnnxOfflineSpeakerDiarizationDestroyResultNative = Void Function( + Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationDestroyResult = void Function( + Pointer); + +typedef SherpaOnnxOfflineSpeakerDiarizationSetConfig = void Function( + Pointer, + Pointer); + typedef SherpaOnnxCreateOfflinePunctuation = SherpaOnnxCreateOfflinePunctuationNative; @@ -940,6 +1079,29 @@ typedef SherpaOnnxFreeWaveNative = Void Function(Pointer); typedef SherpaOnnxFreeWave = void Function(Pointer); class SherpaOnnxBindings { + static SherpaOnnxCreateOfflineSpeakerDiarization? 
+ sherpaOnnxCreateOfflineSpeakerDiarization; + static SherpaOnnxDestroyOfflineSpeakerDiarization? + sherpaOnnxDestroyOfflineSpeakerDiarization; + static SherpaOnnxOfflineSpeakerDiarizationGetSampleRate? + sherpaOnnxOfflineSpeakerDiarizationGetSampleRate; + static SherpaOnnxOfflineSpeakerDiarizationSetConfig? + sherpaOnnxOfflineSpeakerDiarizationSetConfig; + static SherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakers? + sherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakers; + static SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments? + sherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments; + static SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime? + sherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime; + static SherpaOnnxOfflineSpeakerDiarizationDestroySegment? + sherpaOnnxOfflineSpeakerDiarizationDestroySegment; + static SherpaOnnxOfflineSpeakerDiarizationProcess? + sherpaOnnxOfflineSpeakerDiarizationProcess; + static SherpaOnnxOfflineSpeakerDiarizationDestroyResult? + sherpaOnnxOfflineSpeakerDiarizationDestroyResult; + static SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg? + sherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg; + static SherpaOnnxCreateOfflinePunctuation? sherpaOnnxCreateOfflinePunctuation; static SherpaOnnxDestroyOfflinePunctuation? sherpaOnnxDestroyOfflinePunctuation; @@ -1107,6 +1269,83 @@ class SherpaOnnxBindings { static SherpaOnnxFreeWave? 
freeWave; static void init(DynamicLibrary dynamicLibrary) { + sherpaOnnxCreateOfflineSpeakerDiarization ??= dynamicLibrary + .lookup< + NativeFunction< + SherpaOnnxCreateOfflineSpeakerDiarizationNative>>( + 'SherpaOnnxCreateOfflineSpeakerDiarization') + .asFunction(); + + sherpaOnnxDestroyOfflineSpeakerDiarization ??= dynamicLibrary + .lookup< + NativeFunction< + SherpaOnnxDestroyOfflineSpeakerDiarizationNative>>( + 'SherpaOnnxDestroyOfflineSpeakerDiarization') + .asFunction(); + + sherpaOnnxOfflineSpeakerDiarizationGetSampleRate ??= dynamicLibrary + .lookup< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationGetSampleRateNative>>( + 'SherpaOnnxOfflineSpeakerDiarizationGetSampleRate') + .asFunction(); + + sherpaOnnxOfflineSpeakerDiarizationSetConfig ??= dynamicLibrary + .lookup< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationSetConfigNative>>( + 'SherpaOnnxOfflineSpeakerDiarizationSetConfig') + .asFunction(); + + sherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakers ??= dynamicLibrary + .lookup< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakersNative>>( + 'SherpaOnnxOfflineSpeakerDiarizationResultGetNumSpeakers') + .asFunction(); + + sherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments ??= dynamicLibrary + .lookup< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegmentsNative>>( + 'SherpaOnnxOfflineSpeakerDiarizationResultGetNumSegments') + .asFunction(); + + sherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime ??= dynamicLibrary + .lookup< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTimeNative>>( + 'SherpaOnnxOfflineSpeakerDiarizationResultSortByStartTime') + .asFunction(); + + sherpaOnnxOfflineSpeakerDiarizationDestroySegment ??= dynamicLibrary + .lookup< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationDestroySegmentNative>>( + 'SherpaOnnxOfflineSpeakerDiarizationDestroySegment') + .asFunction(); + + sherpaOnnxOfflineSpeakerDiarizationProcess ??= dynamicLibrary + 
.lookup< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationProcessNative>>( + 'SherpaOnnxOfflineSpeakerDiarizationProcess') + .asFunction(); + + sherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg ??= dynamicLibrary + .lookup< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArgNative>>( + 'SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg') + .asFunction(); + + sherpaOnnxOfflineSpeakerDiarizationDestroyResult ??= dynamicLibrary + .lookup< + NativeFunction< + SherpaOnnxOfflineSpeakerDiarizationDestroyResultNative>>( + 'SherpaOnnxOfflineSpeakerDiarizationDestroyResult') + .asFunction(); + sherpaOnnxCreateOfflinePunctuation ??= dynamicLibrary .lookup>( 'SherpaOnnxCreateOfflinePunctuation') diff --git a/flutter/sherpa_onnx/pubspec.yaml b/flutter/sherpa_onnx/pubspec.yaml index 5b693ef0b..e92071833 100644 --- a/flutter/sherpa_onnx/pubspec.yaml +++ b/flutter/sherpa_onnx/pubspec.yaml @@ -1,8 +1,8 @@ name: sherpa_onnx description: > - Speech recognition, speech synthesis, and speaker recognition using next-gen Kaldi - with onnxruntime without Internet connection. + Speech recognition, speech synthesis, speaker diarization, and speaker recognition + using next-gen Kaldi with onnxruntime without Internet connection. 
repository: https://github.com/k2-fsa/sherpa-onnx/tree/master/flutter @@ -12,7 +12,7 @@ documentation: https://k2-fsa.github.io/sherpa/onnx/ topics: - speech-recognition - speech-synthesis - - speaker-identification + - speaker-diarization - audio-tagging - voice-activity-detection @@ -41,7 +41,7 @@ dependencies: sherpa_onnx_linux: ^1.10.27 # sherpa_onnx_linux: # path: ../sherpa_onnx_linux - # + sherpa_onnx_windows: ^1.10.27 # sherpa_onnx_windows: # path: ../sherpa_onnx_windows diff --git a/scripts/dart/speaker-diarization-pubspec.yaml b/scripts/dart/speaker-diarization-pubspec.yaml new file mode 100644 index 000000000..fec147e75 --- /dev/null +++ b/scripts/dart/speaker-diarization-pubspec.yaml @@ -0,0 +1,16 @@ +name: speaker_diarization +description: > + This example demonstrates how to use the Dart API for speaker diarization. + +version: 1.0.0 + +environment: + sdk: ">=3.0.0 <4.0.0" + +dependencies: + sherpa_onnx: + path: ../../flutter/sherpa_onnx + path: ^1.9.0 + +dev_dependencies: + lints: ^3.0.0 diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index abcfc5b82..4ba0a4a60 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -1828,4 +1828,20 @@ SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback( return ans; } +const SherpaOnnxOfflineSpeakerDiarizationResult * +SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg( + const SherpaOnnxOfflineSpeakerDiarization *sd, const float *samples, + int32_t n, + SherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg callback) { + // Process() hands its callback a trailing void* argument; this adapter + // drops it because the NoArg callback accepts only the two chunk counters. + auto *result = new SherpaOnnxOfflineSpeakerDiarizationResult; + result->impl = sd->impl->Process( + samples, n, [callback](int32_t processed, int32_t total, void *) { + return callback(processed, total); + }); + + return result; +} + #endif diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index c9e7f9ee1..4b41a81a9 100644 --- a/sherpa-onnx/c-api/c-api.h +++ 
b/sherpa-onnx/c-api/c-api.h @@ -1485,6 +1485,9 @@ SHERPA_ONNX_API void SherpaOnnxOfflineSpeakerDiarizationDestroySegment( typedef int32_t (*SherpaOnnxOfflineSpeakerDiarizationProgressCallback)( int32_t num_processed_chunk, int32_t num_total_chunks, void *arg); +typedef int32_t (*SherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg)( + int32_t num_processed_chunk, int32_t num_total_chunks); + // The user has to invoke SherpaOnnxOfflineSpeakerDiarizationDestroyResult() // to free the returned pointer to avoid memory leak. SHERPA_ONNX_API const SherpaOnnxOfflineSpeakerDiarizationResult * @@ -1500,6 +1503,12 @@ SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback( int32_t n, SherpaOnnxOfflineSpeakerDiarizationProgressCallback callback, void *arg); +// Variant of SherpaOnnxOfflineSpeakerDiarizationProcessWithCallback() whose +// callback takes no `void *arg` parameter. +// +// The user has to invoke SherpaOnnxOfflineSpeakerDiarizationDestroyResult() +// to free the returned pointer to avoid memory leak. +SHERPA_ONNX_API const SherpaOnnxOfflineSpeakerDiarizationResult * +SherpaOnnxOfflineSpeakerDiarizationProcessWithCallbackNoArg( + const SherpaOnnxOfflineSpeakerDiarization *sd, const float *samples, + int32_t n, + SherpaOnnxOfflineSpeakerDiarizationProgressCallbackNoArg callback); + SHERPA_ONNX_API void SherpaOnnxOfflineSpeakerDiarizationDestroyResult( const SherpaOnnxOfflineSpeakerDiarizationResult *r); diff --git a/sherpa-onnx/csrc/offline-speaker-diarization-pyannote-impl.h b/sherpa-onnx/csrc/offline-speaker-diarization-pyannote-impl.h index 8f669e27c..0c70f0bc6 100644 --- a/sherpa-onnx/csrc/offline-speaker-diarization-pyannote-impl.h +++ b/sherpa-onnx/csrc/offline-speaker-diarization-pyannote-impl.h @@ -5,6 +5,7 @@ #define SHERPA_ONNX_CSRC_OFFLINE_SPEAKER_DIARIZATION_PYANNOTE_IMPL_H_ #include +#include #include #include #include diff --git a/sherpa-onnx/jni/offline-speaker-diarization.cc b/sherpa-onnx/jni/offline-speaker-diarization.cc index a0eef8b9c..e82962c80 100644 --- a/sherpa-onnx/jni/offline-speaker-diarization.cc +++ b/sherpa-onnx/jni/offline-speaker-diarization.cc @@ -204,7 +204,8 @@ Java_com_k2fsa_sherpa_onnx_OfflineSpeakerDiarization_processWithCallback( jfloat *p = env->GetFloatArrayElements(samples, 
nullptr); jsize n = env->GetArrayLength(samples); auto segments = - sd->Process(p, n, callback_wrapper, (void *)arg).SortByStartTime(); + sd->Process(p, n, callback_wrapper, reinterpret_cast(arg)) + .SortByStartTime(); env->ReleaseFloatArrayElements(samples, p, JNI_ABORT); return ProcessImpl(env, segments);