Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion js/web/script/test-runner-cli-args.ts
Original file line number Diff line number Diff line change
Expand Up @@ -405,7 +405,7 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs
// and ChromeCanary is not in CI.

const defaultBrowserBackends = ['webgl', 'webgpu', 'wasm' /*, 'webnn'*/];
const nodejsBackends = ['cpu', 'wasm'];
const nodejsBackends = ['cpu', 'wasm', 'webgpu'];
const backendArgs = args.backend || args.b;
const backend =
typeof backendArgs !== 'string'
Expand Down
11 changes: 10 additions & 1 deletion js/web/test/test-main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,16 @@ import { Logger } from '../lib/onnxjs/instrument';

import { Test } from './test-types';

if (ORT_WEB_TEST_CONFIG.model.some((testGroup) => testGroup.tests.some((test) => test.backend === 'cpu'))) {
if (
// when NPM test is launched with `-e=node` and (`-b=cpu` or `-b=webgpu`), load ONNXRuntime Node.js binding.
platform.name === 'Node.js' &&
(ORT_WEB_TEST_CONFIG.model.some((testGroup) =>
testGroup.tests.some((test) => test.backend === 'cpu' || test.backend === 'webgpu'),
) ||
ORT_WEB_TEST_CONFIG.op.some((testGroup) =>
testGroup.tests.some((test) => test.backend === 'cpu' || test.backend === 'webgpu'),
))
) {
// require onnxruntime-node
require('../../node');
}
Expand Down
35 changes: 19 additions & 16 deletions onnxruntime/core/framework/transpose_helper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ struct has_mlas_transpose<uint32_t> : std::true_type {};
template <typename T>
typename std::enable_if<!has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisOutwards(
const T* input_data, T* output_data, int64_t num_loops, int64_t num_writers, int64_t writes_per_loop,
int64_t writes_per_writer_per_loop) {
int64_t writes_per_writer_per_loop, concurrency::ThreadPool* tp = nullptr) {
ORT_UNUSED_PARAMETER(tp);
const T* end;
for (int64_t l = 0; l < num_loops; ++l) {
T* output_for_first_writer = output_data;
Expand All @@ -48,10 +49,10 @@ typename std::enable_if<!has_mlas_transpose<T>::value, void>::type SimpleTranspo
template <typename T>
typename std::enable_if<has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisOutwards(
const T* input_data, T* output_data, int64_t num_loops, int64_t num_writers, int64_t writes_per_loop,
int64_t writes_per_writer_per_loop) {
int64_t writes_per_writer_per_loop, concurrency::ThreadPool* tp = nullptr) {
for (int64_t l = 0; l < num_loops; ++l) {
MlasTranspose(input_data, output_data, static_cast<size_t>(writes_per_writer_per_loop),
static_cast<size_t>(num_writers));
static_cast<size_t>(num_writers), tp);
input_data += writes_per_loop;
output_data += writes_per_loop;
}
Expand Down Expand Up @@ -82,25 +83,25 @@ void TransposeSingleAxisOutwards(gsl::span<const size_t> permutations, const Ten
switch (bytes_per_write) {
case (sizeof(uint8_t)): {
SimpleTransposeSingleAxisOutwards(input_data, output_data, num_loops, num_writers, writes_per_loop,
writes_per_writer_per_loop);
writes_per_writer_per_loop, tp);
break;
}
case (sizeof(uint16_t)): {
SimpleTransposeSingleAxisOutwards(reinterpret_cast<const uint16_t*>(input_data),
reinterpret_cast<uint16_t*>(output_data), num_loops, num_writers,
writes_per_loop, writes_per_writer_per_loop);
writes_per_loop, writes_per_writer_per_loop, tp);
break;
}
case (sizeof(uint32_t)): {
SimpleTransposeSingleAxisOutwards(reinterpret_cast<const uint32_t*>(input_data),
reinterpret_cast<uint32_t*>(output_data), num_loops, num_writers,
writes_per_loop, writes_per_writer_per_loop);
writes_per_loop, writes_per_writer_per_loop, tp);
break;
}
case (sizeof(uint64_t)): {
SimpleTransposeSingleAxisOutwards(reinterpret_cast<const uint64_t*>(input_data),
reinterpret_cast<uint64_t*>(output_data), num_loops, num_writers,
writes_per_loop, writes_per_writer_per_loop);
writes_per_loop, writes_per_writer_per_loop, tp);
break;
}
default: {
Expand All @@ -125,7 +126,8 @@ void TransposeSingleAxisOutwards(gsl::span<const size_t> permutations, const Ten
template <typename T>
typename std::enable_if<!has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisInwards(
const T* input_data, T* output_data, int64_t num_loops, int64_t num_readers, int64_t reads_per_loop,
int64_t reads_per_reader_per_loop) {
int64_t reads_per_reader_per_loop, concurrency::ThreadPool* tp = nullptr) {
ORT_UNUSED_PARAMETER(tp);
T* end;
for (int64_t l = 0; l < num_loops; ++l) {
const T* input_for_first_reader = input_data;
Expand All @@ -150,10 +152,10 @@ typename std::enable_if<!has_mlas_transpose<T>::value, void>::type SimpleTranspo
template <typename T>
typename std::enable_if<has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisInwards(
const T* input_data, T* output_data, int64_t num_loops, int64_t num_readers, int64_t reads_per_loop,
int64_t reads_per_reader_per_loop) {
int64_t reads_per_reader_per_loop, concurrency::ThreadPool* tp = nullptr) {
for (int64_t l = 0; l < num_loops; ++l) {
MlasTranspose(input_data, output_data, static_cast<size_t>(num_readers),
static_cast<size_t>(reads_per_reader_per_loop));
static_cast<size_t>(reads_per_reader_per_loop), tp);
input_data += reads_per_loop;
output_data += reads_per_loop;
}
Expand All @@ -162,7 +164,8 @@ typename std::enable_if<has_mlas_transpose<T>::value, void>::type SimpleTranspos
// moving a single axis inwards where the read/write size is a power of 2 and between 8 and 64 bits.
// `input_shape_override` overrides the shape of `input` for compute purposes.
void TransposeSingleAxisInwards(gsl::span<const size_t> permutations, const Tensor& input, Tensor& output,
size_t from, size_t to, const TensorShape* input_shape_override = nullptr) {
size_t from, size_t to, const TensorShape* input_shape_override = nullptr,
concurrency::ThreadPool* tp = nullptr) {
ORT_UNUSED_PARAMETER(permutations);

const auto& input_shape = input_shape_override ? *input_shape_override : input.Shape();
Expand All @@ -184,25 +187,25 @@ void TransposeSingleAxisInwards(gsl::span<const size_t> permutations, const Tens
switch (bytes_per_read) {
case (sizeof(uint8_t)): {
SimpleTransposeSingleAxisInwards(input_data, output_data, num_loops, num_readers, reads_per_loop,
reads_per_reader_per_loop);
reads_per_reader_per_loop, tp);
break;
}
case (sizeof(uint16_t)): {
SimpleTransposeSingleAxisInwards(reinterpret_cast<const uint16_t*>(input_data),
reinterpret_cast<uint16_t*>(output_data), num_loops, num_readers, reads_per_loop,
reads_per_reader_per_loop);
reads_per_reader_per_loop, tp);
break;
}
case (sizeof(uint32_t)): {
SimpleTransposeSingleAxisInwards(reinterpret_cast<const uint32_t*>(input_data),
reinterpret_cast<uint32_t*>(output_data), num_loops, num_readers, reads_per_loop,
reads_per_reader_per_loop);
reads_per_reader_per_loop, tp);
break;
}
case (sizeof(uint64_t)): {
SimpleTransposeSingleAxisInwards(reinterpret_cast<const uint64_t*>(input_data),
reinterpret_cast<uint64_t*>(output_data), num_loops, num_readers, reads_per_loop,
reads_per_reader_per_loop);
reads_per_reader_per_loop, tp);
break;
}
default: {
Expand Down Expand Up @@ -236,7 +239,7 @@ void SingleAxisTranspose(gsl::span<const size_t> permutations, const Tensor& inp
if (from > to) {
TransposeSingleAxisOutwards(permutations, input, output, from, to, input_shape_override, tp);
} else {
TransposeSingleAxisInwards(permutations, input, output, from, to, input_shape_override);
TransposeSingleAxisInwards(permutations, input, output, from, to, input_shape_override, tp);
}
}

Expand Down
52 changes: 10 additions & 42 deletions onnxruntime/core/mlas/inc/mlas.h
Original file line number Diff line number Diff line change
Expand Up @@ -1056,49 +1056,15 @@ MlasComputeTanh(
// Transpose routines.
//

//
// Generic transpose of an M x N row-major matrix of elements of type DataType.
// The optional ThreadPool allows the implementation to parallelize the copy;
// pass nullptr for single-threaded execution.
//
template<typename DataType>
void
MLASCALL
MlasTranspose(
    const DataType* Input,
    DataType* Output,
    size_t M,
    size_t N,
    MLAS_THREADPOOL* ThreadPool
    );

//
Expand Down Expand Up @@ -1940,20 +1906,22 @@ MlasConvDepthwise(
MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
);


inline
void
MlasTranspose(
const MLAS_FP16* Input,
MLAS_FP16* Output,
size_t M,
size_t N
size_t N,
MLAS_THREADPOOL* ThreadPool
)
{
MlasTranspose(
reinterpret_cast<const uint16_t*>(Input),
reinterpret_cast<uint16_t*>(Output),
M, N);
M,
N,
ThreadPool);
}


Expand Down
Loading
Loading