Support spoken language identification with whisper (#694)

k2-fsa · Mar 24, 2024 · 0d258dd · 0d258dd
1 parent 3cdad9b
commit 0d258dd
Show file tree

Hide file tree

Showing 36 changed files with 1,173 additions and 200 deletions.
diff --git a/.github/scripts/test-spoken-language-identification.sh b/.github/scripts/test-spoken-language-identification.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+#
+# Smoke-test spoken language identification with whisper models.
+#
+# Required environment:
+#   EXE  name of the language-identification binary; it must be on PATH.
+#
+# For each model size in `names`, the script clones the model repository,
+# pulls only its *.onnx files, and runs $EXE on every downloaded test wave,
+# first with the fp32 encoder/decoder and then with the int8 ones.
+# The cloned repository is removed before the next model is tested.
+
+set -e
+
+# Print a timestamped message tagged with the caller's file, line and function.
+log() {
+  # This function is from espnet
+  local fname=${BASH_SOURCE[1]##*/}
+  echo -e "$(date '+%Y-%m-%d %H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+
+echo "EXE is $EXE"
+echo "PATH: $PATH"
+
+# Abort early (via set -e) when the binary cannot be found on PATH.
+which "$EXE"
+
+names=(
+tiny
+base
+small
+medium
+)
+
+# all_language_codes=bo,ml,tt,fa,sl,bg,sn,sr,tl,km,ln,mr,hr,eu,ro,ba,bs,pl,as,nn,sk,ko,oc,ar,uz,pa,tg,mk,kk,hi,ha,uk,is,de,el,ja,yo,be,so,tk,id,sa,ru,yi,en,am,cs,ne,la,sv,su,pt,mi,ca,sd,hy,haw,fi,et,kn,da,lt,it,nl,he,mg,ur,tr,af,br,bn,ta,no,my,si,mt,th,gl,sw,mn,jw,ms,ps,fo,ka,hu,zh,ht,az,fr,lo,sq,gu,cy,lv,es,lb,te,vi
+
+log "Download test waves"
+waves=(
+ar-arabic.wav
+bg-bulgarian.wav
+cs-czech.wav
+da-danish.wav
+de-german.wav
+el-greek.wav
+en-english.wav
+es-spanish.wav
+fa-persian.wav
+fi-finnish.wav
+fr-french.wav
+hi-hindi.wav
+hr-croatian.wav
+id-indonesian.wav
+it-italian.wav
+ja-japanese.wav
+ko-korean.wav
+nl-dutch.wav
+no-norwegian.wav
+po-polish.wav
+pt-portuguese.wav
+ro-romanian.wav
+ru-russian.wav
+sk-slovak.wav
+sv-swedish.wav
+ta-tamil.wav
+tl-tagalog.wav
+tr-turkish.wav
+uk-ukrainian.wav
+zh-chinese.wav
+)
+
+for wav in "${waves[@]}"; do
+  echo "Downloading $wav"
+  # --fail makes curl exit non-zero on an HTTP error, so set -e stops the
+  # script instead of leaving an error page saved under the .wav name.
+  curl -fSL -O "https://hf-mirror.com/spaces/k2-fsa/spoken-language-identification/resolve/main/test_wavs/$wav"
+  ls -lh *.wav
+done
+
+for name in "${names[@]}"; do
+  log "------------------------------------------------------------"
+  log "Run $name"
+  log "------------------------------------------------------------"
+
+  repo_url=https://huggingface.co/csukuangfj/sherpa-onnx-whisper-$name
+  log "Start testing ${repo_url}"
+  repo=$(basename "$repo_url")
+  log "Download pretrained model and test-data from $repo_url"
+
+  # Skip the LFS smudge on clone; only the *.onnx files are fetched below.
+  GIT_LFS_SKIP_SMUDGE=1 git clone "$repo_url"
+  pushd "$repo"
+  git lfs pull --include "*.onnx"
+  # git lfs pull --include "*.ort"
+  ls -lh *.onnx
+  popd
+
+  for wav in "${waves[@]}"; do
+    log "test fp32 onnx"
+
+    time "$EXE" \
+      --whisper-encoder="$repo/${name}-encoder.onnx" \
+      --whisper-decoder="$repo/${name}-decoder.onnx" \
+      "$wav"
+
+    log "test int8 onnx"
+
+    time "$EXE" \
+      --whisper-encoder="$repo/${name}-encoder.int8.onnx" \
+      --whisper-decoder="$repo/${name}-decoder.int8.onnx" \
+      "$wav"
+  done
+  rm -rf "$repo"
+done
diff --git a/.github/workflows/build-wheels-linux.yaml b/.github/workflows/build-wheels-linux.yaml
@@ -82,7 +82,6 @@ jobs:
         env:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
         uses: nick-fields/retry@v3
-        shell: bash
         with:
           max_attempts: 20
           timeout_seconds: 200

diff --git a/.github/workflows/build-wheels-macos-arm64.yaml b/.github/workflows/build-wheels-macos-arm64.yaml
@@ -21,27 +21,12 @@ jobs:
       fail-fast: false
       matrix:
         os: [macos-latest]
-        python-version: ["cp37", "cp38", "cp39", "cp310", "cp311", "cp312"]
+        python-version: ["cp38", "cp39", "cp310", "cp311", "cp312"]
 
     steps:
       - uses: actions/checkout@v4
 
-      # see https://cibuildwheel.readthedocs.io/en/stable/changelog/
-      # for a list of versions
       - name: Build wheels
-        if: matrix.python-version == 'cp37'
-        uses: pypa/[email protected]
-        env:
-          CIBW_BUILD: "${{ matrix.python-version}}-* "
-          CIBW_ENVIRONMENT: SHERPA_ONNX_CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES='arm64'"
-          CIBW_ARCHS: "arm64"
-          CIBW_BUILD_VERBOSITY: 3
-
-          #  Don't repair macOS wheels
-          CIBW_REPAIR_WHEEL_COMMAND_MACOS: ""
-
-      - name: Build wheels
-        if: matrix.python-version != 'cp37'
         uses: pypa/[email protected]
         env:
           CIBW_BUILD: "${{ matrix.python-version}}-* "

diff --git a/.github/workflows/linux-gpu.yaml b/.github/workflows/linux-gpu.yaml
@@ -92,6 +92,14 @@ jobs:
           file build/bin/sherpa-onnx
           readelf -d build/bin/sherpa-onnx
 
+      - name: Test spoken language identification
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin:$PATH
+          export EXE=sherpa-onnx-offline-language-identification
+
+          .github/scripts/test-spoken-language-identification.sh
+
       - name: Test online CTC
         shell: bash
         run: |
@@ -116,6 +124,7 @@ jobs:
 
           .github/scripts/test-online-paraformer.sh
 
+
       - name: Test offline Whisper
         shell: bash
         run: |

diff --git a/.github/workflows/linux.yaml b/.github/workflows/linux.yaml
@@ -123,6 +123,15 @@ jobs:
           name: release-${{ matrix.build_type }}-${{ matrix.shared_lib }}
           path: build/bin/*
 
+      - name: Test spoken language identification
+        if: matrix.build_type != 'Debug'
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin:$PATH
+          export EXE=sherpa-onnx-offline-language-identification
+
+          .github/scripts/test-spoken-language-identification.sh
+
       - name: Test transducer kws
         shell: bash
         run: |
@@ -140,6 +149,7 @@ jobs:
           .github/scripts/test-online-ctc.sh
 
       - name: Test offline Whisper
+        if: matrix.build_type != 'Debug'
         shell: bash
         run: |
           export PATH=$PWD/build/bin:$PATH

diff --git a/.github/workflows/macos.yaml b/.github/workflows/macos.yaml
@@ -102,6 +102,15 @@ jobs:
           otool -L build/bin/sherpa-onnx
           otool -l build/bin/sherpa-onnx
 
+      - name: Test spoken language identification
+        if: matrix.build_type != 'Debug'
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin:$PATH
+          export EXE=sherpa-onnx-offline-language-identification
+
+          .github/scripts/test-spoken-language-identification.sh
+
       - name: Test transducer kws
         shell: bash
         run: |
@@ -135,6 +144,7 @@ jobs:
           .github/scripts/test-online-paraformer.sh
 
       - name: Test offline Whisper
+        if: matrix.build_type != 'Debug'
         shell: bash
         run: |
           export PATH=$PWD/build/bin:$PATH

diff --git a/.github/workflows/windows-x64-cuda.yaml b/.github/workflows/windows-x64-cuda.yaml
@@ -68,6 +68,14 @@ jobs:
 
           ls -lh ./bin/Release/sherpa-onnx.exe
 
+      - name: Test spoken language identification
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin/Release:$PATH
+          export EXE=sherpa-onnx-offline-language-identification.exe
+
+          .github/scripts/test-spoken-language-identification.sh
+
       - name: Test online CTC
         shell: bash
         run: |

diff --git a/.github/workflows/windows-x64.yaml b/.github/workflows/windows-x64.yaml
@@ -68,6 +68,14 @@ jobs:
 
           ls -lh ./bin/Release/sherpa-onnx.exe
 
+      - name: Test spoken language identification
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin/Release:$PATH
+          export EXE=sherpa-onnx-offline-language-identification.exe
+
+          .github/scripts/test-spoken-language-identification.sh
+
       - name: Test online CTC
         shell: bash
         run: |

diff --git a/.github/workflows/windows-x86.yaml b/.github/workflows/windows-x86.yaml
@@ -69,6 +69,14 @@ jobs:
 
           ls -lh ./bin/Release/sherpa-onnx.exe
 
+      # - name: Test spoken language identification
+      #   shell: bash
+      #   run: |
+      #     export PATH=$PWD/build/bin/Release:$PATH
+      #     export EXE=sherpa-onnx-offline-language-identification.exe
+      #
+      #     .github/scripts/test-spoken-language-identification.sh
+
       - name: Test online CTC
         shell: bash
         run: |

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,7 +1,7 @@
 cmake_minimum_required(VERSION 3.13 FATAL_ERROR)
 project(sherpa-onnx)
 
-set(SHERPA_ONNX_VERSION "1.9.13")
+set(SHERPA_ONNX_VERSION "1.9.14")
 
 # Disable warning about
 #

diff --git a/cmake/cmake_extension.py b/cmake/cmake_extension.py
@@ -43,6 +43,60 @@ def enable_alsa():
     return build_alsa and is_linux() and (is_arm64() or is_x86())
 
 
+def get_binaries():
+    """Return the file names that ``build_extension`` iterates over.
+
+    Executables are listed without a platform suffix: ``build_extension``
+    uses ``.exe`` on Windows and no suffix for ``.dll``/``.lib`` entries.
+    The ALSA programs are added only when ``enable_alsa()`` is true, and
+    the ``.dll``/``.lib`` files only when ``is_windows()`` is true.
+    """
+    executables = [
+        "sherpa-onnx",
+        "sherpa-onnx-keyword-spotter",
+        "sherpa-onnx-microphone",
+        "sherpa-onnx-microphone-offline",
+        "sherpa-onnx-microphone-offline-speaker-identification",
+        "sherpa-onnx-offline",
+        "sherpa-onnx-offline-language-identification",
+        "sherpa-onnx-offline-tts",
+        "sherpa-onnx-offline-tts-play",
+        "sherpa-onnx-offline-websocket-server",
+        "sherpa-onnx-online-websocket-client",
+        "sherpa-onnx-online-websocket-server",
+        "sherpa-onnx-vad-microphone",
+        "sherpa-onnx-vad-microphone-offline-asr",
+    ]
+
+    alsa_executables = [
+        "sherpa-onnx-alsa",
+        "sherpa-onnx-alsa-offline",
+        "sherpa-onnx-alsa-offline-speaker-identification",
+        "sherpa-onnx-offline-tts-play-alsa",
+    ]
+
+    windows_libraries = [
+        "espeak-ng.dll",
+        "kaldi-decoder-core.dll",
+        "kaldi-native-fbank-core.dll",
+        "onnxruntime.dll",
+        "piper_phonemize.dll",
+        "sherpa-onnx-c-api.dll",
+        "sherpa-onnx-core.dll",
+        "sherpa-onnx-fst.lib",
+        "sherpa-onnx-kaldifst-core.lib",
+        "sherpa-onnx-portaudio.dll",
+        "ucd.dll",
+    ]
+
+    binaries = list(executables)
+    if enable_alsa():
+        binaries.extend(alsa_executables)
+    if is_windows():
+        binaries.extend(windows_libraries)
+    return binaries
+
+
 try:
     from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
 
@@ -150,38 +194,7 @@ def build_extension(self, ext: setuptools.extension.Extension):
         suffix = ".exe" if is_windows() else ""
         # Remember to also change setup.py
 
-        binaries = ["sherpa-onnx"]
-        binaries += ["sherpa-onnx-keyword-spotter"]
-        binaries += ["sherpa-onnx-offline"]
-        binaries += ["sherpa-onnx-microphone"]
-        binaries += ["sherpa-onnx-microphone-offline"]
-        binaries += ["sherpa-onnx-microphone-offline-speaker-identification"]
-        binaries += ["sherpa-onnx-online-websocket-server"]
-        binaries += ["sherpa-onnx-offline-websocket-server"]
-        binaries += ["sherpa-onnx-online-websocket-client"]
-        binaries += ["sherpa-onnx-vad-microphone"]
-        binaries += ["sherpa-onnx-vad-microphone-offline-asr"]
-        binaries += ["sherpa-onnx-offline-tts"]
-        binaries += ["sherpa-onnx-offline-tts-play"]
-
-        if enable_alsa():
-            binaries += ["sherpa-onnx-alsa"]
-            binaries += ["sherpa-onnx-alsa-offline"]
-            binaries += ["sherpa-onnx-offline-tts-play-alsa"]
-            binaries += ["sherpa-onnx-alsa-offline-speaker-identification"]
-
-        if is_windows():
-            binaries += ["kaldi-native-fbank-core.dll"]
-            binaries += ["sherpa-onnx-c-api.dll"]
-            binaries += ["sherpa-onnx-core.dll"]
-            binaries += ["sherpa-onnx-portaudio.dll"]
-            binaries += ["onnxruntime.dll"]
-            binaries += ["piper_phonemize.dll"]
-            binaries += ["espeak-ng.dll"]
-            binaries += ["ucd.dll"]
-            binaries += ["kaldi-decoder-core.dll"]
-            binaries += ["sherpa-onnx-fst.lib"]
-            binaries += ["sherpa-onnx-kaldifst-core.lib"]
+        binaries = get_binaries()
 
         for f in binaries:
             suffix = "" if (".dll" in f or ".lib" in f) else suffix