Speaker diarization example with onnxruntime Python API (#1395)

k2-fsa · Oct 6, 2024 · 70165cb · 70165cb
1 parent 5f50cbf
commit 70165cb
Show file tree

Hide file tree

Showing 6 changed files with 719 additions and 1 deletion.
diff --git a/.github/workflows/speaker-diarization.yaml b/.github/workflows/speaker-diarization.yaml
@@ -0,0 +1,98 @@
+name: speaker-diarization
+
+on:
+  push:
+    branches:
+      - speaker-diarization  # push-triggered runs happen only for this branch
+  workflow_dispatch:  # also allows the workflow to be started manually
+
+concurrency:
+  group: speaker-diarization-${{ github.ref }}  # one concurrency group per git ref
+  cancel-in-progress: true  # a newer run in the same group cancels the one still running
+
+jobs:
+  linux:  # NOTE(review): the job id says "linux" but the matrix below only lists macos-latest
+    name: speaker diarization
+    runs-on: ${{ matrix.os }}  # runner image is taken from the matrix
+    strategy:
+      fail-fast: false  # do not cancel the remaining matrix entries when one fails
+      matrix:
+        os: [macos-latest]
+        python-version: ["3.10"]  # quoted so YAML keeps the string "3.10" instead of the float 3.1
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0  # 0 requests the full git history rather than a shallow clone
+
+      - name: ccache  # compiler cache; the from-source build step exports CMAKE_CXX_COMPILER_LAUNCHER=ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+        with:
+          key: ${{ matrix.os }}-speaker-diarization  # cache key is scoped per runner OS
+
+      - name: Setup Python ${{ matrix.python-version }}  # step title shows the matrix Python version
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}  # interpreter version comes from the job matrix
+
+      - name: Install pyannote
+        shell: bash
+        run: |
+          pip install pyannote.audio onnx onnxruntime  # presumably required by the speaker-diarization-*.py scripts run in the test step; verify
+
+      - name: Install sherpa-onnx from source
+        shell: bash
+        run: |
+          python3 -m pip install --upgrade pip
+          python3 -m pip install wheel twine setuptools  # packaging tools installed before the wheel build below
+
+          export CMAKE_CXX_COMPILER_LAUNCHER=ccache  # asks CMake to launch the C++ compiler through ccache
+          export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"  # presumably ccache wrapper directories; prepended so they take precedence
+
+          cat sherpa-onnx/python/sherpa_onnx/__init__.py  # dump the package __init__.py into the job log
+
+          python3 setup.py bdist_wheel  # build a wheel; the next lines list and install it from ./dist
+          ls -lh dist
+          pip install ./dist/*.whl  # install the wheel that was just built
+
+      - name: Run tests
+        shell: bash
+        run: |
+          pushd scripts/pyannote/segmentation  # everything below uses paths relative to this directory
+
+          python3 -c "import sherpa_onnx; print(sherpa_onnx.__file__)"  # show which sherpa_onnx install is picked up
+          python3 -c "import sherpa_onnx; print(sherpa_onnx.__version__)"
+          python3 -c "import sherpa_onnx; print(dir(sherpa_onnx))"
+
+          curl -fSL -O https://huggingface.co/csukuangfj/pyannote-models/resolve/main/segmentation-3.0/pytorch_model.bin  # -f: fail on HTTP errors instead of saving the error page
+
+          test_wavs=(
+            0-two-speakers-zh.wav
+            1-two-speakers-en.wav
+            2-two-speakers-en.wav
+            3-two-speakers-en.wav
+          )
+
+          for w in "${test_wavs[@]}"; do  # quoted so each array element stays exactly one word
+            curl -fSL -O "https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/$w"
+          done
+
+          soxi *.wav  # print audio metadata for the downloaded wave files
+
+          curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-segmentation-models/sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+          tar xvf sherpa-onnx-pyannote-segmentation-3-0.tar.bz2
+          rm sherpa-onnx-pyannote-segmentation-3-0.tar.bz2  # the archive is no longer needed once extracted
+          ls -lh sherpa-onnx-pyannote-segmentation-3-0
+
+          curl -fSL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/speaker-recongition-models/3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx  # URL path kept byte-for-byte, including its "recongition" spelling
+
+          for w in "${test_wavs[@]}"; do  # run both the ONNX and the torch script on every test file
+            echo "---------test $w (onnx)----------"
+            time ./speaker-diarization-onnx.py \
+              --seg-model ./sherpa-onnx-pyannote-segmentation-3-0/model.onnx \
+              --speaker-embedding-model ./3dspeaker_speech_eres2net_base_sv_zh-cn_3dspeaker_16k.onnx \
+              --wav "$w"
+
+            echo "---------test $w (torch)----------"
+            time ./speaker-diarization-torch.py  --wav "$w"
+          done
diff --git a/.gitignore b/.gitignore
@@ -118,3 +118,5 @@ vits-melo-tts-zh_en
 *.o
 *.ppu
 sherpa-onnx-online-punct-en-2024-08-06
+*.mp4
+*.mp3
diff --git a/scripts/pyannote/segmentation/README.md b/scripts/pyannote/segmentation/README.md
@@ -0,0 +1,44 @@
+# File description
+
+Please download test wave files from
+https://github.com/k2-fsa/sherpa-onnx/releases/tag/speaker-segmentation-models
+
+## 0-two-speakers-zh.wav
+
+This file is from
+https://www.modelscope.cn/models/iic/speech_campplus_speaker-diarization_common/file/view/master?fileName=examples%252F2speakers_example.wav&status=0
+
+Note that we have renamed it from `2speakers_example.wav` to `0-two-speakers-zh.wav`.
+
+## 1-two-speakers-en.wav
+
+This file is from
+https://github.com/pengzhendong/pyannote-onnx/blob/master/data/test_16k.wav
+and it contains speech from two speakers.
+
+Note that we have renamed it from `test_16k.wav` to `1-two-speakers-en.wav`.
+
+
+## 2-two-speakers-en.wav
+This file is from
+https://huggingface.co/spaces/Xenova/whisper-speaker-diarization
+
+Note that the original file is `./fcf059e3-689f-47ec-a000-bdace87f0113.mp4`.
+We use the following command to convert it to `2-two-speakers-en.wav`.
+
+```bash
+ffmpeg -i ./fcf059e3-689f-47ec-a000-bdace87f0113.mp4 -ac 1 -ar 16000 ./2-two-speakers-en.wav
+```
+
+## 3-two-speakers-en.wav
+
+This file is from
+https://aws.amazon.com/blogs/machine-learning/deploy-a-hugging-face-pyannote-speaker-diarization-model-on-amazon-sagemaker-as-an-asynchronous-endpoint/
+
+Note that the original file is `ML16091-Audio.mp3`. We use the following
+command to convert it to `3-two-speakers-en.wav`.
+
+
+```bash
+sox ML16091-Audio.mp3 3-two-speakers-en.wav
+```
-Original file line number
+Diff line change
@@ Expand Up / @@ -118,3 +118,5 @@ vits-melo-tts-zh_en @@
     *.o
     *.ppu
     sherpa-onnx-online-punct-en-2024-08-06
+    *.mp4
+    *.mp3