From 16007ee14b1e1d6142c30b80ac423cf49b29a465 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang <csukuangfj@gmail.com>
Date: Thu, 11 Jul 2024 20:27:50 +0800
Subject: [PATCH 1/5] Add cmake support for linux rocm onnxruntime lib

---
 CMakeLists.txt                            |  19 +++-
 cmake/onnxruntime-linux-x86_64-rocm.cmake | 101 ++++++++++++++++++++++
 cmake/onnxruntime.cmake                   |   2 +
 sherpa-onnx/csrc/CMakeLists.txt           |   7 ++
 4 files changed, 127 insertions(+), 2 deletions(-)
 create mode 100644 cmake/onnxruntime-linux-x86_64-rocm.cmake

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 203b8a569f..aad04ef66b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,7 +29,8 @@ option(SHERPA_ONNX_ENABLE_PORTAUDIO "Whether to build with portaudio" ON)
 option(SHERPA_ONNX_ENABLE_JNI "Whether to build JNI internface" OFF)
 option(SHERPA_ONNX_ENABLE_C_API "Whether to build C API" ON)
 option(SHERPA_ONNX_ENABLE_WEBSOCKET "Whether to build webscoket server/client" ON)
-option(SHERPA_ONNX_ENABLE_GPU "Enable ONNX Runtime GPU support" OFF)
+option(SHERPA_ONNX_ENABLE_GPU "Enable ONNX Runtime NVIDIA GPU support" OFF)
+option(SHERPA_ONNX_ENABLE_ROCM "Enable ONNX Runtime AMD GPU support" OFF)
 option(SHERPA_ONNX_ENABLE_WASM "Whether to enable WASM" OFF)
 option(SHERPA_ONNX_ENABLE_WASM_TTS "Whether to enable WASM for TTS" OFF)
 option(SHERPA_ONNX_ENABLE_WASM_ASR "Whether to enable WASM for ASR" OFF)
@@ -82,6 +83,10 @@ if(SHERPA_ONNX_ENABLE_JNI AND NOT BUILD_SHARED_LIBS)
 endif()
 
 if(SHERPA_ONNX_ENABLE_GPU)
+  if(SHERPA_ONNX_ENABLE_ROCM)
+    message(FATAL_ERROR "Both SHERPA_ONNX_ENABLE_GPU and SHERPA_ONNX_ENABLE_ROCM are ON. Please set at most one of them to ON.")
+  endif()
+
   message(WARNING "\
 Compiling for NVIDIA GPU is enabled. Please make sure cudatoolkit
 is installed on your system. Otherwise, you will get errors at runtime.
@@ -89,7 +94,16 @@ Hint: You don't need sudo permission to install CUDA toolkit. Please refer to
   https://k2-fsa.github.io/k2/installation/cuda-cudnn.html
 to install CUDA toolkit if you have not installed it.")
   if(NOT BUILD_SHARED_LIBS)
-    message(STATUS "Set BUILD_SHARED_LIBS to ON since SHERPA_ONNX_ENABLE_GPU is ON")
+    message(STATUS "Set BUILD_SHARED_LIBS to ON since SHERPA_ONNX_ENABLE_GPU is ON") set(BUILD_SHARED_LIBS ON CACHE BOOL "" FORCE)
+  endif()
+endif()
+
+if(SHERPA_ONNX_ENABLE_ROCM)
+  message(WARNING "\
+Compiling for AMD GPU is enabled. Please make sure ROCm
+is installed on your system. Otherwise, you will get errors at runtime.")
+  if(NOT BUILD_SHARED_LIBS)
+    message(STATUS "Set BUILD_SHARED_LIBS to ON since SHERPA_ONNX_ENABLE_ROCM is ON")
     set(BUILD_SHARED_LIBS ON CACHE BOOL "" FORCE)
   endif()
 endif()
@@ -117,6 +131,7 @@ message(STATUS "SHERPA_ONNX_ENABLE_JNI ${SHERPA_ONNX_ENABLE_JNI}")
 message(STATUS "SHERPA_ONNX_ENABLE_C_API ${SHERPA_ONNX_ENABLE_C_API}")
 message(STATUS "SHERPA_ONNX_ENABLE_WEBSOCKET ${SHERPA_ONNX_ENABLE_WEBSOCKET}")
 message(STATUS "SHERPA_ONNX_ENABLE_GPU ${SHERPA_ONNX_ENABLE_GPU}")
+message(STATUS "SHERPA_ONNX_ENABLE_ROCM ${SHERPA_ONNX_ENABLE_ROCM}")
 message(STATUS "SHERPA_ONNX_ENABLE_WASM ${SHERPA_ONNX_ENABLE_WASM}")
 message(STATUS "SHERPA_ONNX_ENABLE_WASM_TTS ${SHERPA_ONNX_ENABLE_WASM_TTS}")
 message(STATUS "SHERPA_ONNX_ENABLE_WASM_ASR ${SHERPA_ONNX_ENABLE_WASM_ASR}")
diff --git a/cmake/onnxruntime-linux-x86_64-rocm.cmake b/cmake/onnxruntime-linux-x86_64-rocm.cmake
new file mode 100644
index 0000000000..d51c2f7277
--- /dev/null
+++ b/cmake/onnxruntime-linux-x86_64-rocm.cmake
@@ -0,0 +1,101 @@
+# Copyright (c)  2022-2023  Xiaomi Corporation
+message(STATUS "CMAKE_SYSTEM_NAME: ${CMAKE_SYSTEM_NAME}")
+message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
+
+if(NOT CMAKE_SYSTEM_NAME STREQUAL Linux)
+  message(FATAL_ERROR "This file is for Linux only. Given: ${CMAKE_SYSTEM_NAME}")
+endif()
+
+if(NOT CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64)
+  message(FATAL_ERROR "This file is for x86_64 only. Given: ${CMAKE_SYSTEM_PROCESSOR}")
+endif()
+
+if(NOT BUILD_SHARED_LIBS)
+  message(FATAL_ERROR "This file is for building shared libraries. BUILD_SHARED_LIBS: ${BUILD_SHARED_LIBS}")
+endif()
+
+if(NOT SHERPA_ONNX_ENABLE_ROCM)
+  message(FATAL_ERROR "This file is for AMD GPU only. Given SHERPA_ONNX_ENABLE_ROCM: ${SHERPA_ONNX_ENABLE_ROCM}")
+endif()
+
+set(onnxruntime_URL  "https://github.com/csukuangfj/onnxruntime-libs/releases/download/v1.18.1/onnxruntime-linux-x64-rocm-Release-1.18.1.zip")
+set(onnxruntime_URL2 "https://hub.nuaa.cf/csukuangfj/onnxruntime-libs/releases/download/v1.18.1/onnxruntime-linux-x64-rocm-Release-1.18.1.zip")
+set(onnxruntime_HASH "SHA256=fe6674d0d4d72d9361667de2ada44a81591c3769c63a87421636317590be659e")
+
+# If you don't have access to the Internet,
+# please download onnxruntime to one of the following locations.
+# You can add more if you want.
+set(possible_file_locations
+  $ENV{HOME}/Downloads/onnxruntime-linux-x64-rocm-Release-1.18.1.zip
+  ${CMAKE_SOURCE_DIR}/onnxruntime-linux-x64-rocm-Release-1.18.1.zip
+  ${CMAKE_BINARY_DIR}/onnxruntime-linux-x64-rocm-Release-1.18.1.zip
+  /tmp/onnxruntime-linux-x64-rocm-Release-1.18.1.zip
+  /star-fj/fangjun/download/github/onnxruntime-linux-x64-rocm-Release-1.18.1.zip
+)
+
+foreach(f IN LISTS possible_file_locations)
+  if(EXISTS "${f}")
+    set(onnxruntime_URL  "${f}")
+    file(TO_CMAKE_PATH "${onnxruntime_URL}" onnxruntime_URL)
+    message(STATUS "Found local downloaded onnxruntime: ${onnxruntime_URL}")
+    set(onnxruntime_URL2)
+    break()
+  endif()
+endforeach()
+
+FetchContent_Declare(onnxruntime
+  URL
+    ${onnxruntime_URL}
+    ${onnxruntime_URL2}
+  URL_HASH          ${onnxruntime_HASH}
+)
+
+FetchContent_GetProperties(onnxruntime)
+if(NOT onnxruntime_POPULATED)
+  message(STATUS "Downloading onnxruntime from ${onnxruntime_URL}")
+  FetchContent_Populate(onnxruntime)
+endif()
+message(STATUS "onnxruntime is downloaded to ${onnxruntime_SOURCE_DIR}")
+
+find_library(location_onnxruntime onnxruntime
+  PATHS
+  "${onnxruntime_SOURCE_DIR}/lib"
+  NO_CMAKE_SYSTEM_PATH
+)
+
+message(STATUS "location_onnxruntime: ${location_onnxruntime}")
+
+add_library(onnxruntime SHARED IMPORTED)
+
+set_target_properties(onnxruntime PROPERTIES
+  IMPORTED_LOCATION ${location_onnxruntime}
+  INTERFACE_INCLUDE_DIRECTORIES "${onnxruntime_SOURCE_DIR}/include"
+)
+
+find_library(location_onnxruntime_rocm_lib onnxruntime_providers_rocm
+  PATHS
+  "${onnxruntime_SOURCE_DIR}/lib"
+  NO_CMAKE_SYSTEM_PATH
+)
+
+add_library(onnxruntime_providers_rocm SHARED IMPORTED)
+set_target_properties(onnxruntime_providers_rocm PROPERTIES
+  IMPORTED_LOCATION ${location_onnxruntime_rocm_lib}
+)
+message(STATUS "location_onnxruntime_rocm_lib: ${location_onnxruntime_rocm_lib}")
+
+# for libonnxruntime_providers_shared.so
+find_library(location_onnxruntime_providers_shared_lib onnxruntime_providers_shared
+  PATHS
+  "${onnxruntime_SOURCE_DIR}/lib"
+  NO_CMAKE_SYSTEM_PATH
+)
+add_library(onnxruntime_providers_shared SHARED IMPORTED)
+set_target_properties(onnxruntime_providers_shared PROPERTIES
+  IMPORTED_LOCATION ${location_onnxruntime_providers_shared_lib}
+)
+message(STATUS "location_onnxruntime_providers_shared_lib: ${location_onnxruntime_providers_shared_lib}")
+
+file(GLOB onnxruntime_lib_files "${onnxruntime_SOURCE_DIR}/lib/libonnxruntime*")
+message(STATUS "onnxruntime lib files: ${onnxruntime_lib_files}")
+install(FILES ${onnxruntime_lib_files} DESTINATION lib)
diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
index d1c4dc851f..fc746b1a0c 100644
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@@ -27,6 +27,8 @@ function(download_onnxruntime)
   elseif(CMAKE_SYSTEM_NAME STREQUAL Linux AND CMAKE_SYSTEM_PROCESSOR STREQUAL x86_64)
     if(SHERPA_ONNX_ENABLE_GPU)
       include(onnxruntime-linux-x86_64-gpu)
+    elseif(SHERPA_ONNX_ENABLE_ROCM)
+      include(onnxruntime-linux-x86_64-rocm)
     elseif(BUILD_SHARED_LIBS)
       include(onnxruntime-linux-x86_64)
     else()
diff --git a/sherpa-onnx/csrc/CMakeLists.txt b/sherpa-onnx/csrc/CMakeLists.txt
index b6bda8ba9b..12ece567aa 100644
--- a/sherpa-onnx/csrc/CMakeLists.txt
+++ b/sherpa-onnx/csrc/CMakeLists.txt
@@ -190,6 +190,13 @@ if(SHERPA_ONNX_ENABLE_GPU)
   )
 endif()
 
+if(SHERPA_ONNX_ENABLE_ROCM)
+  target_link_libraries(sherpa-onnx-core
+    onnxruntime_providers_rocm
+    onnxruntime_providers_shared
+  )
+endif()
+
 if(BUILD_SHARED_LIBS)
   target_link_libraries(sherpa-onnx-core onnxruntime)
 else()

From 706f939ca217ac5f9099a973b08a59659fa8e4d5 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang <csukuangfj@gmail.com>
Date: Thu, 11 Jul 2024 20:36:22 +0800
Subject: [PATCH 2/5] Add C++ code for ROCM execution provider

---
 sherpa-onnx/csrc/provider.cc |  2 ++
 sherpa-onnx/csrc/provider.h  |  3 +-
 sherpa-onnx/csrc/session.cc  | 67 ++++++++++++++++++++++--------------
 3 files changed, 45 insertions(+), 27 deletions(-)

diff --git a/sherpa-onnx/csrc/provider.cc b/sherpa-onnx/csrc/provider.cc
index 19d5859765..80fc5a671e 100644
--- a/sherpa-onnx/csrc/provider.cc
+++ b/sherpa-onnx/csrc/provider.cc
@@ -26,6 +26,8 @@ Provider StringToProvider(std::string s) {
     return Provider::kNNAPI;
   } else if (s == "trt") {
     return Provider::kTRT;
+  } else if (s == "rocm") {
+    return Provider::kRocm;
   } else {
     SHERPA_ONNX_LOGE("Unsupported string: %s. Fallback to cpu", s.c_str());
     return Provider::kCPU;
diff --git a/sherpa-onnx/csrc/provider.h b/sherpa-onnx/csrc/provider.h
index 712006f2b7..8f03e695e0 100644
--- a/sherpa-onnx/csrc/provider.h
+++ b/sherpa-onnx/csrc/provider.h
@@ -19,7 +19,8 @@ enum class Provider {
   kCoreML = 2,   // CoreMLExecutionProvider
   kXnnpack = 3,  // XnnpackExecutionProvider
   kNNAPI = 4,    // NnapiExecutionProvider
-  kTRT = 5,     // TensorRTExecutionProvider
+  kTRT = 5,      // TensorRTExecutionProvider
+  kRocm = 6,     // ROCMExecutionProvider
 };
 
 /**
diff --git a/sherpa-onnx/csrc/session.cc b/sherpa-onnx/csrc/session.cc
index b6fdaaa84e..b303614809 100644
--- a/sherpa-onnx/csrc/session.cc
+++ b/sherpa-onnx/csrc/session.cc
@@ -31,8 +31,8 @@ static void OrtStatusFailure(OrtStatus *status, const char *s) {
   api.ReleaseStatus(status);
 }
 
-static Ort::SessionOptions GetSessionOptionsImpl(int32_t num_threads,
-    const std::string &provider_str,
+static Ort::SessionOptions GetSessionOptionsImpl(
+    int32_t num_threads, const std::string &provider_str,
     const ProviderConfig *provider_config = nullptr) {
   Provider p = StringToProvider(provider_str);
 
@@ -60,15 +60,32 @@ static Ort::SessionOptions GetSessionOptionsImpl(int32_t num_threads,
                     "XnnpackExecutionProvider") != available_providers.end()) {
         sess_opts.AppendExecutionProvider("XNNPACK");
       } else {
-        SHERPA_ONNX_LOGE("Available providers: %s. Fallback to cpu!",
-                         os.str().c_str());
+        SHERPA_ONNX_LOGE(
+            "Available providers: %s. "
+            "XNNPACK is not supported. Fallback to cpu!",
+            os.str().c_str());
+      }
+      break;
+    }
+    case Provider::kRocm: {
+      if (std::find(available_providers.begin(), available_providers.end(),
+                    "ROCMExecutionProvider") != available_providers.end()) {
+        OrtROCMProviderOptions options;
+        options.device_id = provider_config ? provider_config->device : 0;
+        sess_opts.AppendExecutionProvider_ROCM(options);
+      } else {
+        SHERPA_ONNX_LOGE(
+            "Available providers: %s. "
+            "ROCM is not supported. Fallback to cpu!",
+            os.str().c_str());
       }
       break;
     }
     case Provider::kTRT: {
       if (provider_config == nullptr) {
-        SHERPA_ONNX_LOGE("Tensorrt support for Online models ony,"
-                         "Must be extended for offline and others");
+        SHERPA_ONNX_LOGE(
+            "TensorRT support is for Online models only. "
+            "Must be extended for offline and others");
         exit(1);
       }
       auto trt_config = provider_config->trt_config;
@@ -84,30 +101,28 @@ static Ort::SessionOptions GetSessionOptionsImpl(int32_t num_threads,
           std::to_string(trt_config.trt_max_partition_iterations);
       auto trt_min_subgraph_size =
           std::to_string(trt_config.trt_min_subgraph_size);
-      auto trt_fp16_enable =
-          std::to_string(trt_config.trt_fp16_enable);
+      auto trt_fp16_enable = std::to_string(trt_config.trt_fp16_enable);
       auto trt_detailed_build_log =
           std::to_string(trt_config.trt_detailed_build_log);
       auto trt_engine_cache_enable =
           std::to_string(trt_config.trt_engine_cache_enable);
       auto trt_timing_cache_enable =
           std::to_string(trt_config.trt_timing_cache_enable);
-      auto trt_dump_subgraphs =
-          std::to_string(trt_config.trt_dump_subgraphs);
+      auto trt_dump_subgraphs = std::to_string(trt_config.trt_dump_subgraphs);
 
       std::vector<TrtPairs> trt_options = {
-        {"device_id", device_id.c_str()},
-        {"trt_max_workspace_size", trt_max_workspace_size.c_str()},
-        {"trt_max_partition_iterations", trt_max_partition_iterations.c_str()},
-        {"trt_min_subgraph_size", trt_min_subgraph_size.c_str()},
-        {"trt_fp16_enable", trt_fp16_enable.c_str()},
-        {"trt_detailed_build_log", trt_detailed_build_log.c_str()},
-        {"trt_engine_cache_enable", trt_engine_cache_enable.c_str()},
-        {"trt_engine_cache_path", trt_config.trt_engine_cache_path.c_str()},
-        {"trt_timing_cache_enable", trt_timing_cache_enable.c_str()},
-        {"trt_timing_cache_path", trt_config.trt_timing_cache_path.c_str()},
-        {"trt_dump_subgraphs", trt_dump_subgraphs.c_str()}
-      };
+          {"device_id", device_id.c_str()},
+          {"trt_max_workspace_size", trt_max_workspace_size.c_str()},
+          {"trt_max_partition_iterations",
+           trt_max_partition_iterations.c_str()},
+          {"trt_min_subgraph_size", trt_min_subgraph_size.c_str()},
+          {"trt_fp16_enable", trt_fp16_enable.c_str()},
+          {"trt_detailed_build_log", trt_detailed_build_log.c_str()},
+          {"trt_engine_cache_enable", trt_engine_cache_enable.c_str()},
+          {"trt_engine_cache_path", trt_config.trt_engine_cache_path.c_str()},
+          {"trt_timing_cache_enable", trt_timing_cache_enable.c_str()},
+          {"trt_timing_cache_path", trt_config.trt_timing_cache_path.c_str()},
+          {"trt_dump_subgraphs", trt_dump_subgraphs.c_str()}};
       // ToDo : Trt configs
       // "trt_int8_enable"
       // "trt_int8_use_native_calibration_table"
@@ -152,9 +167,8 @@ static Ort::SessionOptions GetSessionOptionsImpl(int32_t num_threads,
 
         if (provider_config != nullptr) {
           options.device_id = provider_config->device;
-          options.cudnn_conv_algo_search =
-              OrtCudnnConvAlgoSearch(provider_config->cuda_config
-                .cudnn_conv_algo_search);
+          options.cudnn_conv_algo_search = OrtCudnnConvAlgoSearch(
+              provider_config->cuda_config.cudnn_conv_algo_search);
         } else {
           options.device_id = 0;
           // Default OrtCudnnConvAlgoSearchExhaustive is extremely slow
@@ -220,7 +234,8 @@ static Ort::SessionOptions GetSessionOptionsImpl(int32_t num_threads,
 
 Ort::SessionOptions GetSessionOptions(const OnlineModelConfig &config) {
   return GetSessionOptionsImpl(config.num_threads,
-        config.provider_config.provider, &config.provider_config);
+                               config.provider_config.provider,
+                               &config.provider_config);
 }
 
 Ort::SessionOptions GetSessionOptions(const OfflineModelConfig &config) {

From 1fbce1a68257a1acccccd73428f7fb2d0c6faac5 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang <csukuangfj@gmail.com>
Date: Thu, 11 Jul 2024 20:38:02 +0800
Subject: [PATCH 3/5] Add CI for ROCM

---
 .github/workflows/linux-rocm.yaml | 205 ++++++++++++++++++++++++++++++
 1 file changed, 205 insertions(+)
 create mode 100644 .github/workflows/linux-rocm.yaml

diff --git a/.github/workflows/linux-rocm.yaml b/.github/workflows/linux-rocm.yaml
new file mode 100644
index 0000000000..0e90340589
--- /dev/null
+++ b/.github/workflows/linux-rocm.yaml
@@ -0,0 +1,205 @@
+name: linux-rocm
+
+on:
+  push:
+    branches:
+      - master
+      - rocm
+    tags:
+      - 'v[0-9]+.[0-9]+.[0-9]+*'
+    paths:
+      - '.github/workflows/linux-rocm.yaml'
+      - '.github/scripts/test-online-transducer.sh'
+      - '.github/scripts/test-online-paraformer.sh'
+      - '.github/scripts/test-offline-transducer.sh'
+      - '.github/scripts/test-offline-ctc.sh'
+      - '.github/scripts/test-online-ctc.sh'
+      - '.github/scripts/test-offline-tts.sh'
+      - 'CMakeLists.txt'
+      - 'cmake/**'
+      - 'sherpa-onnx/csrc/*'
+      - 'sherpa-onnx/c-api/*'
+      - 'c-api-examples/**'
+  pull_request:
+    branches:
+      - master
+    paths:
+      - '.github/workflows/linux-rocm.yaml'
+      - '.github/scripts/test-online-transducer.sh'
+      - '.github/scripts/test-online-paraformer.sh'
+      - '.github/scripts/test-offline-transducer.sh'
+      - '.github/scripts/test-offline-ctc.sh'
+      - '.github/scripts/test-online-ctc.sh'
+      - '.github/scripts/test-offline-tts.sh'
+      - 'CMakeLists.txt'
+      - 'cmake/**'
+      - 'sherpa-onnx/csrc/*'
+      - 'sherpa-onnx/c-api/*'
+      - 'c-api-examples/**'
+
+  workflow_dispatch:
+
+concurrency:
+  group: linux-rocm-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  linux_rocm:
+    runs-on: ${{ matrix.os }}
+    name: ${{ matrix.build_type }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest]
+        # build_type: [Release, Debug]
+        build_type: [Release]
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: ccache
+        uses: hendrikmuhs/ccache-action@v1.2
+        with:
+          key: ${{ matrix.os }}-${{ matrix.build_type }}-rocm
+
+      - name: Configure CMake
+        shell: bash
+        run: |
+          export CMAKE_CXX_COMPILER_LAUNCHER=ccache
+          export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
+          cmake --version
+
+          mkdir build
+          cd build
+          cmake \
+            -D CMAKE_BUILD_TYPE=${{ matrix.build_type }} \
+            -D CMAKE_INSTALL_PREFIX=./install \
+            -D BUILD_SHARED_LIBS=ON \
+            -D SHERPA_ONNX_ENABLE_ROCM=ON \
+            ..
+
+      - name: Build sherpa-onnx for ubuntu
+        shell: bash
+        run: |
+          export PATH="/usr/lib/ccache:/usr/local/opt/ccache/libexec:$PATH"
+
+          cd build
+          make -j2
+          make install
+
+          ls -lh lib
+          ls -lh bin
+
+          echo "----"
+          ls -lh install/lib
+
+          echo "----"
+          ls -lh install/bin
+
+      - name: Display dependencies of sherpa-onnx for linux
+        shell: bash
+        run: |
+          file build/bin/sherpa-onnx
+          readelf -d build/bin/sherpa-onnx
+          echo "----"
+          ldd build/bin/sherpa-onnx
+
+      - name: Test spoken language identification
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin:$PATH
+          export EXE=sherpa-onnx-offline-language-identification
+
+          .github/scripts/test-spoken-language-identification.sh
+
+      - name: Test online CTC
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin:$PATH
+          export EXE=sherpa-onnx
+
+          .github/scripts/test-online-ctc.sh
+
+      - name: Test offline TTS
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin:$PATH
+          export EXE=sherpa-onnx-offline-tts
+
+          .github/scripts/test-offline-tts.sh
+
+      - name: Test online paraformer
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin:$PATH
+          export EXE=sherpa-onnx
+
+          .github/scripts/test-online-paraformer.sh
+
+
+      - name: Test offline Whisper
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin:$PATH
+          export EXE=sherpa-onnx-offline
+
+          .github/scripts/test-offline-whisper.sh
+
+      - name: Test offline CTC
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin:$PATH
+          export EXE=sherpa-onnx-offline
+
+          .github/scripts/test-offline-ctc.sh
+
+      - name: Test offline transducer
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin:$PATH
+          export EXE=sherpa-onnx-offline
+
+          .github/scripts/test-offline-transducer.sh
+
+      - name: Test online transducer
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin:$PATH
+          export EXE=sherpa-onnx
+
+          .github/scripts/test-online-transducer.sh
+
+      - name: Test online transducer (C API)
+        shell: bash
+        run: |
+          export PATH=$PWD/build/bin:$PATH
+          export EXE=decode-file-c-api
+
+          .github/scripts/test-online-transducer.sh
+
+      - name: Copy files
+        if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/')
+        shell: bash
+        run: |
+          SHERPA_ONNX_VERSION=v$(grep "SHERPA_ONNX_VERSION" ./CMakeLists.txt  | cut -d " " -f 2  | cut -d '"' -f 2)
+
+          dst=sherpa-onnx-${SHERPA_ONNX_VERSION}-linux-x64-rocm
+          mkdir $dst
+
+          cp -a build/install/bin $dst/
+          cp -a build/install/lib $dst/
+          cp -a build/install/include $dst/
+
+          tree $dst
+
+          tar cjvf ${dst}.tar.bz2 $dst
+
+      - name: Release pre-compiled binaries and libs for linux x64
+        if: (github.repository_owner == 'csukuangfj' || github.repository_owner == 'k2-fsa') && github.event_name == 'push' && contains(github.ref, 'refs/tags/')
+        uses: svenstaro/upload-release-action@v2
+        with:
+          file_glob: true
+          overwrite: true
+          file: sherpa-onnx-*linux-x64-rocm.tar.bz2

From 1a58ae6421f08f32e5ede93a45243bc7b1b4ead5 Mon Sep 17 00:00:00 2001
From: Fangjun Kuang <csukuangfj@gmail.com>
Date: Thu, 11 Jul 2024 20:40:22 +0800
Subject: [PATCH 4/5] fix typos

---
 CMakeLists.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index aad04ef66b..2bec3ab233 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -94,7 +94,8 @@ Hint: You don't need sudo permission to install CUDA toolkit. Please refer to
   https://k2-fsa.github.io/k2/installation/cuda-cudnn.html
 to install CUDA toolkit if you have not installed it.")
   if(NOT BUILD_SHARED_LIBS)
-    message(STATUS "Set BUILD_SHARED_LIBS to ON since SHERPA_ONNX_ENABLE_GPU is ON") set(BUILD_SHARED_LIBS ON CACHE BOOL "" FORCE)
+    message(STATUS "Set BUILD_SHARED_LIBS to ON since SHERPA_ONNX_ENABLE_GPU is ON")
+    set(BUILD_SHARED_LIBS ON CACHE BOOL "" FORCE)
   endif()
 endif()
 

From 5650413b7e50d3b1ce300bc45c439600b094e47c Mon Sep 17 00:00:00 2001
From: Fangjun Kuang <csukuangfj@gmail.com>
Date: Thu, 11 Jul 2024 21:29:53 +0800
Subject: [PATCH 5/5] fix style issues

---
 sherpa-onnx/csrc/provider-config.cc | 59 ++++++++++++++---------------
 1 file changed, 28 insertions(+), 31 deletions(-)

diff --git a/sherpa-onnx/csrc/provider-config.cc b/sherpa-onnx/csrc/provider-config.cc
index 3c8f0ee473..1db62aa6bc 100644
--- a/sherpa-onnx/csrc/provider-config.cc
+++ b/sherpa-onnx/csrc/provider-config.cc
@@ -13,14 +13,15 @@ namespace sherpa_onnx {
 
 void CudaConfig::Register(ParseOptions *po) {
   po->Register("cuda-cudnn-conv-algo-search", &cudnn_conv_algo_search,
-          "CuDNN convolution algrorithm search");
+               "CuDNN convolution algorithm search");
 }
 
 bool CudaConfig::Validate() const {
   if (cudnn_conv_algo_search < 1 || cudnn_conv_algo_search > 3) {
-    SHERPA_ONNX_LOGE("cudnn_conv_algo_search: '%d' is not a valid option."
-                     "Options : [1,3]. Check OnnxRT docs",
-                    cudnn_conv_algo_search);
+    SHERPA_ONNX_LOGE(
+        "cudnn_conv_algo_search: '%d' is not a valid option. "
+        "Options : [1,3]. Check OnnxRT docs",
+        cudnn_conv_algo_search);
     return false;
   }
   return true;
@@ -37,41 +38,41 @@ std::string CudaConfig::ToString() const {
 
 void TensorrtConfig::Register(ParseOptions *po) {
   po->Register("trt-max-workspace-size", &trt_max_workspace_size,
-              "Set TensorRT EP GPU memory usage limit.");
+               "Set TensorRT EP GPU memory usage limit.");
   po->Register("trt-max-partition-iterations", &trt_max_partition_iterations,
-              "Limit partitioning iterations for model conversion.");
+               "Limit partitioning iterations for model conversion.");
   po->Register("trt-min-subgraph-size", &trt_min_subgraph_size,
-              "Set minimum size for subgraphs in partitioning.");
+               "Set minimum size for subgraphs in partitioning.");
   po->Register("trt-fp16-enable", &trt_fp16_enable,
-              "Enable FP16 precision for faster performance.");
+               "Enable FP16 precision for faster performance.");
   po->Register("trt-detailed-build-log", &trt_detailed_build_log,
-              "Enable detailed logging of build steps.");
+               "Enable detailed logging of build steps.");
   po->Register("trt-engine-cache-enable", &trt_engine_cache_enable,
-              "Enable caching of TensorRT engines.");
+               "Enable caching of TensorRT engines.");
   po->Register("trt-timing-cache-enable", &trt_timing_cache_enable,
-              "Enable use of timing cache to speed up builds.");
+               "Enable use of timing cache to speed up builds.");
   po->Register("trt-engine-cache-path", &trt_engine_cache_path,
-              "Set path to store cached TensorRT engines.");
+               "Set path to store cached TensorRT engines.");
   po->Register("trt-timing-cache-path", &trt_timing_cache_path,
-              "Set path for storing timing cache.");
+               "Set path for storing timing cache.");
   po->Register("trt-dump-subgraphs", &trt_dump_subgraphs,
-              "Dump optimized subgraphs for debugging.");
+               "Dump optimized subgraphs for debugging.");
 }
 
 bool TensorrtConfig::Validate() const {
   if (trt_max_workspace_size < 0) {
-    SHERPA_ONNX_LOGE("trt_max_workspace_size: %lld is not valid.",
-        trt_max_workspace_size);
+    SHERPA_ONNX_LOGE("trt_max_workspace_size: %lld is not valid.",
+                     static_cast<long long>(trt_max_workspace_size));
     return false;
   }
   if (trt_max_partition_iterations < 0) {
     SHERPA_ONNX_LOGE("trt_max_partition_iterations: %d is not valid.",
-        trt_max_partition_iterations);
+                     trt_max_partition_iterations);
     return false;
   }
   if (trt_min_subgraph_size < 0) {
     SHERPA_ONNX_LOGE("trt_min_subgraph_size: %d is not valid.",
-        trt_min_subgraph_size);
+                     trt_min_subgraph_size);
     return false;
   }
 
@@ -83,23 +84,19 @@ std::string TensorrtConfig::ToString() const {
 
   os << "TensorrtConfig(";
   os << "trt_max_workspace_size=" << trt_max_workspace_size << ", ";
-  os << "trt_max_partition_iterations="
-      << trt_max_partition_iterations << ", ";
+  os << "trt_max_partition_iterations=" << trt_max_partition_iterations << ", ";
   os << "trt_min_subgraph_size=" << trt_min_subgraph_size << ", ";
-  os << "trt_fp16_enable=\""
-      << (trt_fp16_enable? "True" : "False") << "\", ";
+  os << "trt_fp16_enable=\"" << (trt_fp16_enable ? "True" : "False") << "\", ";
   os << "trt_detailed_build_log=\""
-      << (trt_detailed_build_log? "True" : "False") << "\", ";
+     << (trt_detailed_build_log ? "True" : "False") << "\", ";
   os << "trt_engine_cache_enable=\""
-      << (trt_engine_cache_enable? "True" : "False") << "\", ";
-  os << "trt_engine_cache_path=\""
-      << trt_engine_cache_path.c_str() << "\", ";
+     << (trt_engine_cache_enable ? "True" : "False") << "\", ";
+  os << "trt_engine_cache_path=\"" << trt_engine_cache_path.c_str() << "\", ";
   os << "trt_timing_cache_enable=\""
-      << (trt_timing_cache_enable? "True" : "False") << "\", ";
-  os << "trt_timing_cache_path=\""
-      << trt_timing_cache_path.c_str() << "\",";
-  os << "trt_dump_subgraphs=\""
-      << (trt_dump_subgraphs? "True" : "False") << "\" )";
+     << (trt_timing_cache_enable ? "True" : "False") << "\", ";
+  os << "trt_timing_cache_path=\"" << trt_timing_cache_path.c_str() << "\", ";
+  os << "trt_dump_subgraphs=\"" << (trt_dump_subgraphs ? "True" : "False")
+     << "\" )";
   return os.str();
 }