Skip to content

Commit 6c9711b

Browse files
Merge branch 'master' into sync_msft_11_4_25
2 parents c2558f3 + 7a03764 commit 6c9711b

File tree

14 files changed

+306
-189
lines changed

14 files changed

+306
-189
lines changed

js/web/script/test-runner-cli-args.ts

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -405,7 +405,7 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs
405405
// and ChromeCanary is not in CI.
406406

407407
const defaultBrowserBackends = ['webgl', 'webgpu', 'wasm' /*, 'webnn'*/];
408-
const nodejsBackends = ['cpu', 'wasm'];
408+
const nodejsBackends = ['cpu', 'wasm', 'webgpu'];
409409
const backendArgs = args.backend || args.b;
410410
const backend =
411411
typeof backendArgs !== 'string'

js/web/test/test-main.ts

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,16 @@ import { Logger } from '../lib/onnxjs/instrument';
1313

1414
import { Test } from './test-types';
1515

16-
if (ORT_WEB_TEST_CONFIG.model.some((testGroup) => testGroup.tests.some((test) => test.backend === 'cpu'))) {
16+
if (
17+
// when NPM test is launched with `-e=node` and (`-b=cpu` or `-b=webgpu`), load ONNXRuntime Node.js binding.
18+
platform.name === 'Node.js' &&
19+
(ORT_WEB_TEST_CONFIG.model.some((testGroup) =>
20+
testGroup.tests.some((test) => test.backend === 'cpu' || test.backend === 'webgpu'),
21+
) ||
22+
ORT_WEB_TEST_CONFIG.op.some((testGroup) =>
23+
testGroup.tests.some((test) => test.backend === 'cpu' || test.backend === 'webgpu'),
24+
))
25+
) {
1726
// require onnxruntime-node
1827
require('../../node');
1928
}

onnxruntime/core/framework/transpose_helper.cc

Lines changed: 19 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -22,7 +22,8 @@ struct has_mlas_transpose<uint32_t> : std::true_type {};
2222
template <typename T>
2323
typename std::enable_if<!has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisOutwards(
2424
const T* input_data, T* output_data, int64_t num_loops, int64_t num_writers, int64_t writes_per_loop,
25-
int64_t writes_per_writer_per_loop) {
25+
int64_t writes_per_writer_per_loop, concurrency::ThreadPool* tp = nullptr) {
26+
ORT_UNUSED_PARAMETER(tp);
2627
const T* end;
2728
for (int64_t l = 0; l < num_loops; ++l) {
2829
T* output_for_first_writer = output_data;
@@ -48,10 +49,10 @@ typename std::enable_if<!has_mlas_transpose<T>::value, void>::type SimpleTranspo
4849
template <typename T>
4950
typename std::enable_if<has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisOutwards(
5051
const T* input_data, T* output_data, int64_t num_loops, int64_t num_writers, int64_t writes_per_loop,
51-
int64_t writes_per_writer_per_loop) {
52+
int64_t writes_per_writer_per_loop, concurrency::ThreadPool* tp = nullptr) {
5253
for (int64_t l = 0; l < num_loops; ++l) {
5354
MlasTranspose(input_data, output_data, static_cast<size_t>(writes_per_writer_per_loop),
54-
static_cast<size_t>(num_writers));
55+
static_cast<size_t>(num_writers), tp);
5556
input_data += writes_per_loop;
5657
output_data += writes_per_loop;
5758
}
@@ -82,25 +83,25 @@ void TransposeSingleAxisOutwards(gsl::span<const size_t> permutations, const Ten
8283
switch (bytes_per_write) {
8384
case (sizeof(uint8_t)): {
8485
SimpleTransposeSingleAxisOutwards(input_data, output_data, num_loops, num_writers, writes_per_loop,
85-
writes_per_writer_per_loop);
86+
writes_per_writer_per_loop, tp);
8687
break;
8788
}
8889
case (sizeof(uint16_t)): {
8990
SimpleTransposeSingleAxisOutwards(reinterpret_cast<const uint16_t*>(input_data),
9091
reinterpret_cast<uint16_t*>(output_data), num_loops, num_writers,
91-
writes_per_loop, writes_per_writer_per_loop);
92+
writes_per_loop, writes_per_writer_per_loop, tp);
9293
break;
9394
}
9495
case (sizeof(uint32_t)): {
9596
SimpleTransposeSingleAxisOutwards(reinterpret_cast<const uint32_t*>(input_data),
9697
reinterpret_cast<uint32_t*>(output_data), num_loops, num_writers,
97-
writes_per_loop, writes_per_writer_per_loop);
98+
writes_per_loop, writes_per_writer_per_loop, tp);
9899
break;
99100
}
100101
case (sizeof(uint64_t)): {
101102
SimpleTransposeSingleAxisOutwards(reinterpret_cast<const uint64_t*>(input_data),
102103
reinterpret_cast<uint64_t*>(output_data), num_loops, num_writers,
103-
writes_per_loop, writes_per_writer_per_loop);
104+
writes_per_loop, writes_per_writer_per_loop, tp);
104105
break;
105106
}
106107
default: {
@@ -125,7 +126,8 @@ void TransposeSingleAxisOutwards(gsl::span<const size_t> permutations, const Ten
125126
template <typename T>
126127
typename std::enable_if<!has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisInwards(
127128
const T* input_data, T* output_data, int64_t num_loops, int64_t num_readers, int64_t reads_per_loop,
128-
int64_t reads_per_reader_per_loop) {
129+
int64_t reads_per_reader_per_loop, concurrency::ThreadPool* tp = nullptr) {
130+
ORT_UNUSED_PARAMETER(tp);
129131
T* end;
130132
for (int64_t l = 0; l < num_loops; ++l) {
131133
const T* input_for_first_reader = input_data;
@@ -150,10 +152,10 @@ typename std::enable_if<!has_mlas_transpose<T>::value, void>::type SimpleTranspo
150152
template <typename T>
151153
typename std::enable_if<has_mlas_transpose<T>::value, void>::type SimpleTransposeSingleAxisInwards(
152154
const T* input_data, T* output_data, int64_t num_loops, int64_t num_readers, int64_t reads_per_loop,
153-
int64_t reads_per_reader_per_loop) {
155+
int64_t reads_per_reader_per_loop, concurrency::ThreadPool* tp = nullptr) {
154156
for (int64_t l = 0; l < num_loops; ++l) {
155157
MlasTranspose(input_data, output_data, static_cast<size_t>(num_readers),
156-
static_cast<size_t>(reads_per_reader_per_loop));
158+
static_cast<size_t>(reads_per_reader_per_loop), tp);
157159
input_data += reads_per_loop;
158160
output_data += reads_per_loop;
159161
}
@@ -162,7 +164,8 @@ typename std::enable_if<has_mlas_transpose<T>::value, void>::type SimpleTranspos
162164
// moving a single axis inwards where the read/write size is a power of 2 and between 8 and 64 bits.
163165
// `input_shape_override` overrides the shape of `input` for compute purposes.
164166
void TransposeSingleAxisInwards(gsl::span<const size_t> permutations, const Tensor& input, Tensor& output,
165-
size_t from, size_t to, const TensorShape* input_shape_override = nullptr) {
167+
size_t from, size_t to, const TensorShape* input_shape_override = nullptr,
168+
concurrency::ThreadPool* tp = nullptr) {
166169
ORT_UNUSED_PARAMETER(permutations);
167170

168171
const auto& input_shape = input_shape_override ? *input_shape_override : input.Shape();
@@ -184,25 +187,25 @@ void TransposeSingleAxisInwards(gsl::span<const size_t> permutations, const Tens
184187
switch (bytes_per_read) {
185188
case (sizeof(uint8_t)): {
186189
SimpleTransposeSingleAxisInwards(input_data, output_data, num_loops, num_readers, reads_per_loop,
187-
reads_per_reader_per_loop);
190+
reads_per_reader_per_loop, tp);
188191
break;
189192
}
190193
case (sizeof(uint16_t)): {
191194
SimpleTransposeSingleAxisInwards(reinterpret_cast<const uint16_t*>(input_data),
192195
reinterpret_cast<uint16_t*>(output_data), num_loops, num_readers, reads_per_loop,
193-
reads_per_reader_per_loop);
196+
reads_per_reader_per_loop, tp);
194197
break;
195198
}
196199
case (sizeof(uint32_t)): {
197200
SimpleTransposeSingleAxisInwards(reinterpret_cast<const uint32_t*>(input_data),
198201
reinterpret_cast<uint32_t*>(output_data), num_loops, num_readers, reads_per_loop,
199-
reads_per_reader_per_loop);
202+
reads_per_reader_per_loop, tp);
200203
break;
201204
}
202205
case (sizeof(uint64_t)): {
203206
SimpleTransposeSingleAxisInwards(reinterpret_cast<const uint64_t*>(input_data),
204207
reinterpret_cast<uint64_t*>(output_data), num_loops, num_readers, reads_per_loop,
205-
reads_per_reader_per_loop);
208+
reads_per_reader_per_loop, tp);
206209
break;
207210
}
208211
default: {
@@ -236,7 +239,7 @@ void SingleAxisTranspose(gsl::span<const size_t> permutations, const Tensor& inp
236239
if (from > to) {
237240
TransposeSingleAxisOutwards(permutations, input, output, from, to, input_shape_override, tp);
238241
} else {
239-
TransposeSingleAxisInwards(permutations, input, output, from, to, input_shape_override);
242+
TransposeSingleAxisInwards(permutations, input, output, from, to, input_shape_override, tp);
240243
}
241244
}
242245

onnxruntime/core/mlas/inc/mlas.h

Lines changed: 10 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -1056,49 +1056,15 @@ MlasComputeTanh(
10561056
// Transpose routines.
10571057
//
10581058

1059+
template<typename DataType>
10591060
void
10601061
MLASCALL
10611062
MlasTranspose(
1062-
const uint8_t* Input,
1063-
uint8_t* Output,
1064-
size_t M,
1065-
size_t N
1066-
);
1067-
1068-
void
1069-
MLASCALL
1070-
MlasTranspose(
1071-
const int8_t* Input,
1072-
int8_t* Output,
1073-
size_t M,
1074-
size_t N
1075-
);
1076-
1077-
void
1078-
MLASCALL
1079-
MlasTranspose(
1080-
const uint16_t* Input,
1081-
uint16_t* Output,
1082-
size_t M,
1083-
size_t N
1084-
);
1085-
1086-
void
1087-
MLASCALL
1088-
MlasTranspose(
1089-
const uint32_t* Input,
1090-
uint32_t* Output,
1063+
const DataType* Input,
1064+
DataType* Output,
10911065
size_t M,
1092-
size_t N
1093-
);
1094-
1095-
void
1096-
MLASCALL
1097-
MlasTranspose(
1098-
const float* Input,
1099-
float* Output,
1100-
size_t M,
1101-
size_t N
1066+
size_t N,
1067+
MLAS_THREADPOOL* ThreadPool
11021068
);
11031069

11041070
//
@@ -1940,20 +1906,22 @@ MlasConvDepthwise(
19401906
MLAS_HALF_GEMM_POSTPROCESSOR* PostProc
19411907
);
19421908

1943-
19441909
inline
19451910
void
19461911
MlasTranspose(
19471912
const MLAS_FP16* Input,
19481913
MLAS_FP16* Output,
19491914
size_t M,
1950-
size_t N
1915+
size_t N,
1916+
MLAS_THREADPOOL* ThreadPool
19511917
)
19521918
{
19531919
MlasTranspose(
19541920
reinterpret_cast<const uint16_t*>(Input),
19551921
reinterpret_cast<uint16_t*>(Output),
1956-
M, N);
1922+
M,
1923+
N,
1924+
ThreadPool);
19571925
}
19581926

19591927

0 commit comments

Comments
 (0)