Add C API for audio tagging (#754)

k2-fsa · Apr 11, 2024 · f204e62 · f204e62
1 parent 34d70a2
commit f204e62
Show file tree

Hide file tree

Showing 9 changed files with 285 additions and 32 deletions.
diff --git a/.github/scripts/test-c-api.sh b/.github/scripts/test-c-api.sh
@@ -10,8 +10,21 @@ log() {
 
 echo "SLID_EXE is $SLID_EXE"
 echo "SID_EXE is $SID_EXE"
+echo "AT_EXE is $AT_EXE"
 echo "PATH: $PATH"
 
+log "------------------------------------------------------------"
+log "Test audio tagging                                          "
+log "------------------------------------------------------------"
+
+curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
+tar xvf sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
+rm sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
+
+$AT_EXE
+
+rm -rf sherpa-onnx-zipformer-audio-tagging-2024-04-09
+
 
 log "------------------------------------------------------------"
 log "Download whisper tiny for spoken language identification    "

diff --git a/.github/workflows/linux.yaml b/.github/workflows/linux.yaml
@@ -126,6 +126,16 @@ jobs:
           name: release-${{ matrix.build_type }}-with-shared-lib-${{ matrix.shared_lib }}-with-tts-${{ matrix.with_tts }}
           path: build/bin/*
 
+      - name: Test C API
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin:$PATH
+          export SLID_EXE=spoken-language-identification-c-api
+          export SID_EXE=speaker-identification-c-api
+          export AT_EXE=audio-tagging-c-api
+
+          .github/scripts/test-c-api.sh
+
       - name: Test Audio tagging
         shell: bash
         run: |
@@ -142,14 +152,6 @@ jobs:
 
           .github/scripts/test-online-ctc.sh
 
-      - name: Test C API
-        shell: bash
-        run: |
-          export PATH=$PWD/build/bin:$PATH
-          export SLID_EXE=spoken-language-identification-c-api
-          export SID_EXE=speaker-identification-c-api
-
-          .github/scripts/test-c-api.sh
 
       - name: Test spoken language identification (C++ API)
         shell: bash

diff --git a/.github/workflows/macos.yaml b/.github/workflows/macos.yaml
@@ -105,22 +105,23 @@ jobs:
           otool -L build/bin/sherpa-onnx
           otool -l build/bin/sherpa-onnx
 
-      - name: Test Audio tagging
+      - name: Test C API
         shell: bash
         run: |
           export PATH=$PWD/build/bin:$PATH
-          export EXE=sherpa-onnx-offline-audio-tagging
+          export SLID_EXE=spoken-language-identification-c-api
+          export SID_EXE=speaker-identification-c-api
+          export AT_EXE=audio-tagging-c-api
 
-          .github/scripts/test-audio-tagging.sh
+          .github/scripts/test-c-api.sh
 
-      - name: Test C API
+      - name: Test Audio tagging
         shell: bash
         run: |
           export PATH=$PWD/build/bin:$PATH
-          export SLID_EXE=spoken-language-identification-c-api
-          export SID_EXE=speaker-identification-c-api
+          export EXE=sherpa-onnx-offline-audio-tagging
 
-          .github/scripts/test-c-api.sh
+          .github/scripts/test-audio-tagging.sh
 
       - name: Test spoken language identification (C++ API)
         shell: bash

diff --git a/.github/workflows/windows-x64.yaml b/.github/workflows/windows-x64.yaml
@@ -72,22 +72,24 @@ jobs:
 
           ls -lh ./bin/Release/sherpa-onnx.exe
 
-      - name: Test Audio tagging
+      - name: Test C API
         shell: bash
         run: |
           export PATH=$PWD/build/bin/Release:$PATH
-          export EXE=sherpa-onnx-offline-audio-tagging.exe
+          export SLID_EXE=spoken-language-identification-c-api.exe
+          export SID_EXE=speaker-identification-c-api.exe
+          export AT_EXE=audio-tagging-c-api.exe
 
-          .github/scripts/test-audio-tagging.sh
+          .github/scripts/test-c-api.sh
 
-      - name: Test C API
+
+      - name: Test Audio tagging
         shell: bash
         run: |
           export PATH=$PWD/build/bin/Release:$PATH
-          export SLID_EXE=spoken-language-identification-c-api.exe
-          export SID_EXE=speaker-identification-c-api.exe
+          export EXE=sherpa-onnx-offline-audio-tagging.exe
 
-          .github/scripts/test-c-api.sh
+          .github/scripts/test-audio-tagging.sh
 
       - name: Test spoken language identification (C++ API)
         shell: bash

diff --git a/.github/workflows/windows-x86.yaml b/.github/workflows/windows-x86.yaml
@@ -77,6 +77,8 @@ jobs:
         run: |
           export PATH=$PWD/build/bin/Release:$PATH
           export SLID_EXE=spoken-language-identification-c-api.exe
+          export SID_EXE=speaker-identification-c-api.exe
+          export AT_EXE=audio-tagging-c-api.exe
 
           .github/scripts/test-c-api.sh
 

diff --git a/c-api-examples/CMakeLists.txt b/c-api-examples/CMakeLists.txt
@@ -18,6 +18,9 @@ target_link_libraries(speaker-identification-c-api sherpa-onnx-c-api)
 add_executable(streaming-hlg-decode-file-c-api streaming-hlg-decode-file-c-api.c)
 target_link_libraries(streaming-hlg-decode-file-c-api sherpa-onnx-c-api)
 
+add_executable(audio-tagging-c-api audio-tagging-c-api.c)
+target_link_libraries(audio-tagging-c-api sherpa-onnx-c-api)
+
 if(SHERPA_ONNX_HAS_ALSA)
   add_subdirectory(./asr-microphone-example)
 elseif((UNIX AND NOT APPLE) OR LINUX)

diff --git a/c-api-examples/audio-tagging-c-api.c b/c-api-examples/audio-tagging-c-api.c
@@ -0,0 +1,79 @@
+// c-api-examples/audio-tagging-c-api.c
+//
+// Copyright (c)  2024  Xiaomi Corporation
+
+// We assume you have pre-downloaded the model files for testing
+// from https://github.com/k2-fsa/sherpa-onnx/releases/tag/audio-tagging-models
+//
+// An example is given below:
+//
+// clang-format off
+//
+// wget https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
+// tar xvf sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
+// rm sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
+//
+// clang-format on
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "sherpa-onnx/c-api/c-api.h"
+
+// Tags one test wave with the zipformer audio tagging model and prints the
+// top events to stderr.
+//
+// Returns 0 on success, or -1 if the tagger cannot be created or the wave
+// file cannot be read.
+int32_t main() {
+  SherpaOnnxAudioTaggingConfig config;
+  // Zero the whole struct so that every field we do not set explicitly below
+  // is 0 / NULL instead of indeterminate.
+  memset(&config, 0, sizeof(config));
+
+  config.model.zipformer.model =
+      "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/model.int8.onnx";
+  config.model.num_threads = 1;
+  config.model.debug = 1;
+  config.model.provider = "cpu";
+  // clang-format off
+  config.labels = "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/class_labels_indices.csv";
+  // clang-format on
+
+  const SherpaOnnxAudioTagging *tagger = SherpaOnnxCreateAudioTagging(&config);
+  if (!tagger) {
+    fprintf(stderr,
+            "Failed to create audio tagger. Please check your config\n");
+    return -1;
+  }
+
+  // You can find more test waves from
+  // https://github.com/k2-fsa/sherpa-onnx/releases/download/audio-tagging-models/sherpa-onnx-zipformer-audio-tagging-2024-04-09.tar.bz2
+  const char *wav_filename =
+      "./sherpa-onnx-zipformer-audio-tagging-2024-04-09/test_wavs/1.wav";
+
+  const SherpaOnnxWave *wave = SherpaOnnxReadWave(wav_filename);
+  if (wave == NULL) {
+    fprintf(stderr, "Failed to read %s\n", wav_filename);
+    // The tagger was created successfully above; release it before bailing
+    // out so that the error path does not leak it.
+    SherpaOnnxDestroyAudioTagging(tagger);
+    return -1;
+  }
+
+  const SherpaOnnxOfflineStream *stream =
+      SherpaOnnxAudioTaggingCreateOfflineStream(tagger);
+
+  AcceptWaveformOffline(stream, wave->sample_rate, wave->samples,
+                        wave->num_samples);
+
+  int32_t top_k = 5;
+  const SherpaOnnxAudioEvent *const *results =
+      SherpaOnnxAudioTaggingCompute(tagger, stream, top_k);
+
+  fprintf(stderr, "--------------------------------------------------\n");
+  fprintf(stderr, "Index\t\tProbability\t\tEvent name\n");
+  fprintf(stderr, "--------------------------------------------------\n");
+  // The result array is terminated by a NULL entry and may contain fewer
+  // than top_k events, so stop at the terminator instead of assuming that
+  // exactly top_k entries are present.
+  for (int32_t i = 0; i != top_k && results[i] != NULL; ++i) {
+    fprintf(stderr, "%d\t\t%.3f\t\t\t%s\n", i, results[i]->prob,
+            results[i]->name);
+  }
+  fprintf(stderr, "--------------------------------------------------\n");
+
+  // Release everything in the reverse order of creation.
+  SherpaOnnxAudioTaggingFreeResults(results);
+  DestroyOfflineStream(stream);
+  SherpaOnnxFreeWave(wave);
+  SherpaOnnxDestroyAudioTagging(tagger);
+
+  return 0;
+}
diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc
@@ -10,6 +10,7 @@
 #include <utility>
 #include <vector>
 
+#include "sherpa-onnx/csrc/audio-tagging.h"
 #include "sherpa-onnx/csrc/circular-buffer.h"
 #include "sherpa-onnx/csrc/display.h"
 #include "sherpa-onnx/csrc/keyword-spotter.h"
@@ -400,15 +401,18 @@ SherpaOnnxOfflineStream *CreateOfflineStream(
   return stream;
 }
 
-void DestroyOfflineStream(SherpaOnnxOfflineStream *stream) { delete stream; }
+void DestroyOfflineStream(const SherpaOnnxOfflineStream *stream) {
+  delete stream;
+}
 
-void AcceptWaveformOffline(SherpaOnnxOfflineStream *stream, int32_t sample_rate,
-                           const float *samples, int32_t n) {
+void AcceptWaveformOffline(const SherpaOnnxOfflineStream *stream,
+                           int32_t sample_rate, const float *samples,
+                           int32_t n) {
   stream->impl->AcceptWaveform(sample_rate, samples, n);
 }
 
-void DecodeOfflineStream(SherpaOnnxOfflineRecognizer *recognizer,
-                         SherpaOnnxOfflineStream *stream) {
+void DecodeOfflineStream(const SherpaOnnxOfflineRecognizer *recognizer,
+                         const SherpaOnnxOfflineStream *stream) {
   recognizer->impl->DecodeStream(stream->impl.get());
 }
 
@@ -1209,3 +1213,89 @@ void SherpaOnnxSpeakerEmbeddingManagerFreeAllSpeakers(
 
   delete[] names;
 }
+
+// Handle type given to C callers for an audio tagger. It only owns the C++
+// sherpa_onnx::AudioTagging object, which is destroyed together with it.
+struct SherpaOnnxAudioTagging {
+  std::unique_ptr<sherpa_onnx::AudioTagging> impl;
+};
+
+// Creates an audio tagger from a C config struct.
+//
+// @param config  Configuration to copy into a sherpa_onnx::AudioTaggingConfig.
+//                Fields go through SHERPA_ONNX_OR with the fallback shown
+//                below (presumably used when the field is 0 / NULL -- the
+//                macro is defined outside this block).
+// @return A new tagger that must be released with
+//         SherpaOnnxDestroyAudioTagging(), or nullptr if `config` is NULL or
+//         fails validation.
+const SherpaOnnxAudioTagging *SherpaOnnxCreateAudioTagging(
+    const SherpaOnnxAudioTaggingConfig *config) {
+  if (config == nullptr) {
+    SHERPA_ONNX_LOGE("Errors in config");
+    return nullptr;
+  }
+
+  sherpa_onnx::AudioTaggingConfig ac;
+  ac.model.zipformer.model = SHERPA_ONNX_OR(config->model.zipformer.model, "");
+  ac.model.num_threads = SHERPA_ONNX_OR(config->model.num_threads, 1);
+  ac.model.debug = config->model.debug;
+  ac.model.provider = SHERPA_ONNX_OR(config->model.provider, "cpu");
+  ac.labels = SHERPA_ONNX_OR(config->labels, "");
+  ac.top_k = SHERPA_ONNX_OR(config->top_k, 5);
+
+  if (ac.model.debug) {
+    SHERPA_ONNX_LOGE("%s\n", ac.ToString().c_str());
+  }
+
+  if (!ac.Validate()) {
+    SHERPA_ONNX_LOGE("Errors in config");
+    return nullptr;
+  }
+
+  // Build the implementation first and keep it in a unique_ptr: if either
+  // this constructor or the allocation of the wrapper below throws, nothing
+  // is leaked.
+  auto impl = std::make_unique<sherpa_onnx::AudioTagging>(ac);
+
+  SherpaOnnxAudioTagging *tagger = new SherpaOnnxAudioTagging;
+  tagger->impl = std::move(impl);
+
+  return tagger;
+}
+
+// Releases a tagger returned by SherpaOnnxCreateAudioTagging(). The owned
+// implementation object is destroyed together with the wrapper.
+void SherpaOnnxDestroyAudioTagging(const SherpaOnnxAudioTagging *tagger) {
+  delete tagger;
+}
+
+// Wraps a stream created by the tagger in a heap-allocated
+// SherpaOnnxOfflineStream and hands ownership of the wrapper to the caller.
+const SherpaOnnxOfflineStream *SherpaOnnxAudioTaggingCreateOfflineStream(
+    const SherpaOnnxAudioTagging *tagger) {
+  return new SherpaOnnxOfflineStream(tagger->impl->CreateStream());
+}
+
+// Runs the tagger on stream `s` and converts the returned events into a
+// heap-allocated array of C structs.
+//
+// The array has one entry per event followed by a terminating nullptr. Every
+// entry and its `name` string are allocated with new / new[] here.
+const SherpaOnnxAudioEvent *const *SherpaOnnxAudioTaggingCompute(
+    const SherpaOnnxAudioTagging *tagger, const SherpaOnnxOfflineStream *s,
+    int32_t top_k) {
+  const std::vector<sherpa_onnx::AudioEvent> found =
+      tagger->impl->Compute(s->impl.get(), top_k);
+
+  const int32_t count = static_cast<int32_t>(found.size());
+
+  // One extra slot for the terminating nullptr.
+  auto **out = new SherpaOnnxAudioEvent *[count + 1];
+
+  for (int32_t k = 0; k != count; ++k) {
+    const auto &src = found[k];
+
+    // Copy the event name into a NUL-terminated C string.
+    char *label = new char[src.name.size() + 1];
+    *std::copy(src.name.begin(), src.name.end(), label) = 0;
+
+    auto *dst = new SherpaOnnxAudioEvent;
+    dst->name = label;
+    dst->index = src.index;
+    dst->prob = src.prob;
+
+    out[k] = dst;
+  }
+
+  out[count] = nullptr;
+
+  return out;
+}
+
+// Frees a nullptr-terminated event array: each event's name, each event, and
+// finally the array itself. Passing nullptr is a no-op.
+void SherpaOnnxAudioTaggingFreeResults(
+    const SherpaOnnxAudioEvent *const *events) {
+  if (events != nullptr) {
+    for (auto it = events; *it != nullptr; ++it) {
+      const SherpaOnnxAudioEvent *event = *it;
+
+      delete[] event->name;
+      delete event;
+    }
+  }
+
+  delete[] events;
+}