Skip to content

Commit

Permalink
Pascal API for speaker diarization (#1420)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Oct 12, 2024
1 parent 1ed803a commit 5e273c5
Show file tree
Hide file tree
Showing 5 changed files with 506 additions and 2 deletions.
15 changes: 15 additions & 0 deletions .github/workflows/pascal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,21 @@ jobs:
cp -v ../sherpa-onnx/pascal-api/*.pas ../pascal-api-examples/tts
fi
# Runs the speaker-diarization Pascal example through its run.sh with the
# FPC 3.2.2 win64 binaries prepended to PATH, then removes the *.onnx, *.wav
# and sherpa-onnx-* files left in that directory.
- name: Run Pascal test (Speaker diarization)
shell: bash
run: |
export PATH=/c/lazarus/fpc/3.2.2/bin/x86_64-win64:$PATH
cd ./pascal-api-examples
pushd speaker-diarization
./run.sh
rm -rfv *.onnx *.wav sherpa-onnx-*
ls -lh
echo "---"
popd
- name: Run Pascal test (TTS)
shell: bash
run: |
Expand Down
1 change: 1 addition & 0 deletions pascal-api-examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ https://k2-fsa.github.io/sherpa/onnx/pascal-api/index.html
|Directory| Description|
|---------|------------|
|[read-wav](./read-wav)|It shows how to read a wave file.|
|[speaker-diarization](./speaker-diarization)|It shows how to use the Pascal API for speaker diarization.|
|[streaming-asr](./streaming-asr)| It shows how to use streaming models for speech recognition.|
|[non-streaming-asr](./non-streaming-asr)| It shows how to use non-streaming models for speech recognition.|
|[vad](./vad)| It shows how to use the voice activity detection API.|
Expand Down
104 changes: 104 additions & 0 deletions pascal-api-examples/speaker-diarization/main.pas
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
{ Copyright (c) 2024 Xiaomi Corporation }
{
This file shows how to use the Pascal API from sherpa-onnx
for speaker diarization.
Usage:
Step 1: Download a speaker segmentation model
Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
for a list of available models. The following is an example
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
Step 2: Download a speaker embedding extractor model
Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-recongition-models
for a list of available models. The following is an example
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
Step 3: Download a test wave file
Please visit https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
for a list of available test wave files. The following is an example
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
Step 4: Build and run it. See ./run.sh in this directory for the complete sequence of commands.
}

program main;

{$mode delphi}

uses
sherpa_onnx,
ctypes,
SysUtils;

{
  Progress callback passed to Sd.Process in the main program below.

  NumProcessedChunks: count of chunks handled so far.
  NumTotalChunks: count of chunks in total.

  Prints the completion percentage (processed / total * 100) with three
  decimals and always returns 0.
}
function ProgressCallback(
  NumProcessedChunks: cint32;
  NumTotalChunks: cint32): cint32; cdecl;
var
  Progress: Single;
begin
  { This function is called from native code (cdecl). A zero total would make
    the division raise a floating point exception that must not propagate
    through foreign stack frames, so report 0% in that case instead. }
  if NumTotalChunks <> 0 then
    Progress := 100.0 * NumProcessedChunks / NumTotalChunks
  else
    Progress := 0.0;

  WriteLn(Format('Progress: %.3f%%', [Progress]));

  Result := 0;
end;

var
  Wave: TSherpaOnnxWave;
  Config: TSherpaOnnxOfflineSpeakerDiarizationConfig;
  Sd: TSherpaOnnxOfflineSpeakerDiarization;
  Segments: TSherpaOnnxOfflineSpeakerDiarizationSegmentArray;
  I: Integer;
begin
  { Main program: read the test wave, configure the segmentation and embedding
    models, run offline speaker diarization with a progress callback and print
    one line per resulting segment. The process exit code is 1 on any
    failure so that calling scripts (e.g. run.sh with "set -e") notice it. }
  Wave := SherpaOnnxReadWave('./0-four-speakers-zh.wav');

  Config.Segmentation.Pyannote.Model := './sherpa-onnx-pyannote-segmentation-3-0/model.onnx';
  Config.Embedding.Model := './3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx';

  {
    Since we know that there are 4 speakers in ./0-four-speakers-zh.wav, we
    set NumClusters to 4 here.
    If you don't have such information, please set NumClusters to -1.
    In that case, you have to set Config.Clustering.Threshold.
    A larger threshold leads to fewer clusters, i.e., fewer speakers.
  }
  Config.Clustering.NumClusters := 4;
  Config.Segmentation.Debug := True;
  Config.Embedding.Debug := True;

  Sd := TSherpaOnnxOfflineSpeakerDiarization.Create(Config);
  if Sd.GetHandle = nil then
  begin
    { NOTE(review): Sd is intentionally not freed on this path, exactly as in
      the original code; whether the destructor tolerates a nil handle is not
      visible from this file - confirm before adding a Free here. }
    WriteLn('Please check your config');
    ExitCode := 1;
    Exit;
  end;

  { From here on Sd owns a valid handle; try..finally guarantees that it is
    released on every path, including the early Exit below. }
  try
    if Sd.GetSampleRate <> Wave.SampleRate then
    begin
      WriteLn(Format('Expected sample rate: %d, given: %d', [Sd.GetSampleRate, Wave.SampleRate]));
      ExitCode := 1;
      Exit;
    end;

    {
      // If you don't want to use a callback
      Segments := Sd.Process(Wave.Samples);
    }
    Segments := Sd.Process(Wave.Samples, @ProgressCallback);

    for I := Low(Segments) to High(Segments) do
    begin
      WriteLn(Format('%.3f -- %.3f speaker_%d',
        [Segments[I].Start, Segments[I].Stop, Segments[I].Speaker]));
    end;
  finally
    FreeAndNil(Sd);
  end;
end.
49 changes: 49 additions & 0 deletions pascal-api-examples/speaker-diarization/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#!/usr/bin/env bash
#
# Builds the sherpa-onnx shared C API library if it is not installed yet,
# compiles main.pas with fpc, downloads the models and the test wave file
# that main.pas expects, and finally runs the example.
#
# The script may be started from any directory; it switches to its own
# directory first.

set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
SHERPA_ONNX_DIR=$(cd -- "$SCRIPT_DIR/../.." && pwd)
LIB_DIR=$SHERPA_ONNX_DIR/build/install/lib

echo "SHERPA_ONNX_DIR: $SHERPA_ONNX_DIR"

# main.pas, the downloaded models and the wave file are all addressed
# relative to this directory, so make it the working directory.
cd -- "$SCRIPT_DIR"

# Build and install the shared library only when none of the
# platform-specific variants (macOS, Linux, Windows) is present.
if [[ ! -f "$LIB_DIR/libsherpa-onnx-c-api.dylib" && ! -f "$LIB_DIR/libsherpa-onnx-c-api.so" && ! -f "$LIB_DIR/sherpa-onnx-c-api.dll" ]]; then
  mkdir -p "$SHERPA_ONNX_DIR/build"
  pushd "$SHERPA_ONNX_DIR/build"
  cmake \
    -DCMAKE_INSTALL_PREFIX=./install \
    -DSHERPA_ONNX_ENABLE_PYTHON=OFF \
    -DSHERPA_ONNX_ENABLE_TESTS=OFF \
    -DSHERPA_ONNX_ENABLE_CHECK=OFF \
    -DBUILD_SHARED_LIBS=ON \
    -DSHERPA_ONNX_ENABLE_PORTAUDIO=OFF \
    ..

  cmake --build . --target install --config Release
  popd
fi

fpc \
  -dSHERPA_ONNX_USE_SHARED_LIBS \
  "-Fu$SHERPA_ONNX_DIR/sherpa-onnx/pascal-api" \
  "-Fl$LIB_DIR" \
  ./main.pas

# Append the previous value only when it is non-empty: a trailing ':' would
# add the current directory to the loader search path.
export LD_LIBRARY_PATH="$LIB_DIR${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
export DYLD_LIBRARY_PATH="$LIB_DIR${DYLD_LIBRARY_PATH:+:$DYLD_LIBRARY_PATH}"

# -f makes curl fail on HTTP errors instead of saving the error page under
# the model's file name.
if [ ! -f ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
  rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
fi

if [ ! -f ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx
fi

if [ ! -f ./0-four-speakers-zh.wav ]; then
  curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/0-four-speakers-zh.wav
fi

./main
Loading

0 comments on commit 5e273c5

Please sign in to comment.