
Commit 825097e

Merge remote-tracking branch 'origin/fs-eire/webgpu-ep' into webgpu-profiling
2 parents 9f09601 + 8261ca6 commit 825097e

61 files changed: +1,227 −406 lines


cgmanifests/generated/cgmanifest.json (+1 −1)

@@ -36,7 +36,7 @@
       "component": {
         "type": "git",
         "git": {
-          "commitHash": "f46495ea96f68fc3f6c394f099b2992743f6ff7f",
+          "commitHash": "4447c7562e3bc702ade25105912dce503f0c4010",
           "repositoryUrl": "https://github.com/abseil/abseil-cpp.git"
         },
         "comments": "abseil_cpp"

cmake/deps.txt (+1 −1)

@@ -12,7 +12,7 @@
 # NOTE: You must run deps_update_and_upload.py and generate_cgmanifest.py when ready to test your changes in a CI.
 # See https://microsoft.sharepoint.com/teams/ONNX2/_layouts/OneNote.aspx?id=%2Fteams%2FONNX2%2FShared%20Documents%2FNotebooks%2FONNX%20Ecosystem%20Team%20Notebook&wd=target%28Development.one%7C63D3AB47-51D1-4A62-9965-66882234BD44%2FAdd%20or%20update%20a%20dependency%20in%20deps.txt%7C0E9ED71D-89D5-40FA-B05F-C0123289C591%2F%29
 #
-abseil_cpp;https://github.com/abseil/abseil-cpp/archive/f46495ea96f68fc3f6c394f099b2992743f6ff7f.zip;0e2b6d1dc7f0a808d1e23f7dd985f7bc18d52cbc
+abseil_cpp;https://github.com/abseil/abseil-cpp/archive/refs/tags/20240722.0.zip;36ee53eb1466fb6e593fc5c286680de31f8a494a
 coremltools;https://github.com/apple/coremltools/archive/refs/tags/7.1.zip;f1bab0f30966f2e217d8e01207d518f230a1641a
 cxxopts;https://github.com/jarro2783/cxxopts/archive/3c73d91c0b04e2b59462f0a741be8c07024c1bc0.zip;6c6ca7f8480b26c8d00476e0e24b7184717fe4f0
 date;https://github.com/HowardHinnant/date/archive/refs/tags/v3.0.1.zip;2dac0c81dc54ebdd8f8d073a75c053b04b56e159

cmake/external/abseil-cpp.cmake (+1 −1)

@@ -27,7 +27,7 @@ FetchContent_Declare(
   URL ${DEP_URL_abseil_cpp}
   URL_HASH SHA1=${DEP_SHA1_abseil_cpp}
   PATCH_COMMAND ${ABSL_PATCH_COMMAND}
-  FIND_PACKAGE_ARGS NAMES absl
+  FIND_PACKAGE_ARGS 20240722 NAMES absl
 )

 onnxruntime_fetchcontent_makeavailable(abseil_cpp)

cmake/external/abseil-cpp.natvis (+1 −1)

@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="utf-8"?>
 <AutoVisualizer xmlns="http://schemas.microsoft.com/vstudio/debugger/natvis/2010">
-  <Type Name="absl::lts_20240116::InlinedVector&lt;*&gt;">
+  <Type Name="absl::lts_20240722::InlinedVector&lt;*&gt;">
     <Intrinsic Name="_size" Expression="storage_.metadata_.value >> 1"/>
     <Intrinsic Name="_is_allocated" Expression="(storage_.metadata_.value &amp; 1) == 1"/>
     <Intrinsic Name="_inlined_data" Expression="($T1*)storage_.data_.inlined.inlined_data"/>

cmake/patches/onnx/onnx.patch (+558)

Large diffs are not rendered by default.

dockerfiles/Dockerfile.cuda (+39 −16)

@@ -2,16 +2,19 @@
 # Copyright (c) Microsoft Corporation. All rights reserved.
 # Licensed under the MIT License.
 # --------------------------------------------------------------
-# Build onnxruntime-gpu python package with CUDA 12.6 & CUDNN 9.4 for python 3.12 in Ubuntu 24.04 for Nvidia GPU.
+# Build onnxruntime-gpu python package with CUDA 12.x & CUDNN 9.x for python 3.12 in Ubuntu 24.04.
 # If memory is less than 64GB, you may change "--parallel" to "--parallel 4" to avoid out-of-memory error.

-FROM nvcr.io/nvidia/cuda:12.6.1-devel-ubuntu24.04
+ARG CUDA_VERSION=12.6.1
+ARG CUDNN_VERSION=9.5.0.50
+ARG OS=ubuntu24.04

-# Target CUDA device with compute capability >= 6.1
+FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-${OS}
+ARG CUDA_VERSION
+ARG CUDNN_VERSION
 ARG CMAKE_CUDA_ARCHITECTURES="61;70;75;80;86;90"

 ENV DEBIAN_FRONTEND=noninteractive
-MAINTAINER Changming Sun "[email protected]"

 # Add source code to /code
 ADD . /code

@@ -34,9 +37,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     && rm -rf /var/lib/apt/lists/*

 # Install CUDNN 9.4.0.58 for building ONNX Runtime with CUDA.
-RUN wget https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/cudnn-linux-x86_64-9.4.0.58_cuda12-archive.tar.xz \
+RUN cudnn_tar="cudnn-linux-x86_64-${CUDNN_VERSION}_cuda${CUDA_VERSION%%.*}-archive.tar.xz" \
+    && wget "https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-x86_64/${cudnn_tar}" \
     && mkdir -p /code/build/cudnn \
-    && tar -Jxvf cudnn-linux-x86_64-9.4.0.58_cuda12-archive.tar.xz -C /code/build/cudnn --strip=1
+    && tar -Jxvf ${cudnn_tar} -C /code/build/cudnn --strip=1 \
+    && rm -f ${cudnn_tar}

 # Create a virtual environment and install dependencies, then build ONNX Runtime with CUDA support.
 RUN cd /code \

@@ -55,34 +60,52 @@ RUN cd /code \
     --cmake_extra_defines ONNXRUNTIME_VERSION=$(cat ./VERSION_NUMBER) "CMAKE_CUDA_ARCHITECTURES=${CMAKE_CUDA_ARCHITECTURES}" onnxruntime_BUILD_UNIT_TESTS=OFF

 # Start second stage to copy the build artifacts
-FROM nvcr.io/nvidia/cuda:12.6.1-runtime-ubuntu24.04
-ENV DEBIAN_FRONTEND=noninteractive
+FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-runtime-${OS}
+ARG CUDA_VERSION
+ARG CUDNN_VERSION
+ARG GIT_COMMIT
+ARG GIT_BRANCH
+ARG ONNXRUNTIME_VERSION
+
+# Make sure the required build arguments are set. See README.md for more information.
+RUN test -n ${GIT_COMMIT:?}
+RUN test -n ${GIT_BRANCH:?}
+RUN test -n ${ONNXRUNTIME_VERSION:?}
+
+LABEL CUDA_VERSION="${CUDA_VERSION}"
+LABEL CUDNN_VERSION="${CUDNN_VERSION}"
+LABEL maintainer="Changming Sun <[email protected]>"
+LABEL onnxruntime_version="${ONNXRUNTIME_VERSION}"
+LABEL onnxruntime_git_branch="${GIT_BRANCH}"
+LABEL onnxruntime_git_commit="${GIT_COMMIT}"

 # Copy built wheel and license
 COPY --from=0 /code/build/Linux/Release/dist /ort
 COPY --from=0 /code/dockerfiles/LICENSE-IMAGE.txt /code/LICENSE-IMAGE.txt

-# Set LD_LIBRARY_PATH so that runtime can load CUDA and CUDNN DLLs.
-# CUDNN will be installed by nvidia-cudnn-cu12 python package later.
-# Its location is in the site-packages directory, which can be retrieved like the following:
-# python -c "import sysconfig; print(sysconfig.get_path('purelib'))"
+# Set environment variables
+ENV DEBIAN_FRONTEND=noninteractive
+ENV CUDNN_VERSION=$CUDNN_VERSION
+ENV ONNXRUNTIME_VERSION=$ONNXRUNTIME_VERSION
+# CUDNN from nvidia-cudnn-cu12 python package is located in the site-packages directory of python virtual environment.
 ENV LD_LIBRARY_PATH="/ort/env/lib/python3.12/site-packages/nvidia/cudnn/lib:/usr/local/cuda/lib64"

-# Install runtime dependencies, and run a simple test to verify the installation.
+# Install runtime dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
     libstdc++6 \
     ca-certificates \
     python3-pip \
     python3.12-venv \
-    unattended-upgrades \
-    && unattended-upgrade \
     && python3 -m venv /ort/env \
     && . /ort/env/bin/activate \
     && pip install /ort/*.whl \
-    && pip install nvidia-cudnn-cu12==9.4.0.58 \
+    && pip install nvidia-cudnn-cu${CUDA_VERSION%%.*}==${CUDNN_VERSION} \
     && python -c 'import onnxruntime; print(onnxruntime.get_available_providers())' \
     && rm -rf /ort/*.whl \
     && rm -rf /var/lib/apt/lists/*

 # Ensure the virtual environment is always activated when running commands in the container.
 RUN echo ". /ort/env/bin/activate" >> ~/.bashrc
+
+# Set the default command to start an interactive bash shell
+CMD [ "/bin/bash" ]

dockerfiles/README.md (+18 −3)

@@ -40,18 +40,33 @@ The docker file supports both x86_64 and ARM64(aarch64). You may use docker's "-
 However, we cannot build the code for 32-bit ARM in such a way since a 32-bit compiler/linker might not have enough memory to generate the binaries.

 ## CUDA
-**Ubuntu 22.04, CUDA 12.1, CuDNN 8**
+**Ubuntu 24.04, CUDA 12.x, CuDNN 9.x**

 1. Build the docker image from the Dockerfile in this repository.
+  Choose available [cuda version](https://hub.docker.com/r/nvidia/cuda/tags) or [cudnn version](https://pypi.org/project/nvidia-cudnn-cu12/#history), then build docker image like the following:
+
   ```
-  docker build -t onnxruntime-cuda -f Dockerfile.cuda ..
+  git submodule update --init
+  docker build -t onnxruntime-cuda --build-arg CUDA_VERSION=12.6.1 \
+    --build-arg CUDNN_VERSION=9.5.0.50 \
+    --build-arg GIT_BRANCH=$(git rev-parse --abbrev-ref HEAD) \
+    --build-arg GIT_COMMIT=$(git rev-parse HEAD) \
+    --build-arg ONNXRUNTIME_VERSION=$(cat ../VERSION_NUMBER) \
+    -f Dockerfile.cuda ..
+
   ```

+  To inspect the labels of the built image, run the following:
+  ```
+  docker inspect onnxruntime-cuda
+  ```
 2. Run the Docker image

   ```
-  docker run --gpus all -it onnxruntime-cuda
+  docker run --rm --gpus all -it onnxruntime-cuda
+  ```
   or
+  ```
   nvidia-docker run -it onnxruntime-cuda

   ```

include/onnxruntime/core/framework/execution_provider.h (+8)

@@ -214,6 +214,14 @@ class IExecutionProvider {
     return Status::OK();
   }

+  /**
+     Called when InferenceSession::SetEpDynamicOptions is called
+   */
+  virtual common::Status SetEpDynamicOptions(gsl::span<const char* const> /*keys*/,
+                                             gsl::span<const char* const> /*values*/) {
+    return Status::OK();
+  }
+
   /**
      Indicate whether the graph capturing mode (e.g., cuda graph) is enabled for
      the provider.
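The default implementation is a no-op, so existing providers compile unchanged. As a rough sketch of the intended override point, a provider might handle the new hook like this; the `MyExecutionProvider` class, its constructor, and the option handling are illustrative assumptions, not part of this commit:

```cpp
// Sketch only: a hypothetical provider reacting to dynamic options.
// Only the SetEpDynamicOptions signature comes from this commit's diff.
#include <string>
#include <string_view>

#include "core/framework/execution_provider.h"

class MyExecutionProvider : public onnxruntime::IExecutionProvider {
 public:
  MyExecutionProvider() : IExecutionProvider("MyExecutionProvider") {}

  onnxruntime::common::Status SetEpDynamicOptions(
      gsl::span<const char* const> keys,
      gsl::span<const char* const> values) override {
    // keys/values arrive as parallel arrays of null-terminated strings.
    for (size_t i = 0; i < keys.size() && i < values.size(); ++i) {
      if (std::string_view(keys[i]) == "ep.dynamic.workload_type") {
        workload_type_ = values[i];  // e.g. "Default" or "Efficient"
      }
    }
    return onnxruntime::common::Status::OK();
  }

 private:
  std::string workload_type_ = "Default";
};
```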

include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h (+10 −55)

@@ -695,7 +695,7 @@ class RunQueue {

 static std::atomic<uint32_t> next_tag{1};

-template <typename Environment, bool kIsHybrid>
+template <typename Environment>
 class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInterface {
  private:
   struct PerThread;

@@ -767,29 +767,6 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
   typedef std::function<void()> Task;
   typedef RunQueue<Task, Tag, 1024> Queue;

-  // Class for waiting w/ exponential backoff.
-  // Template argument is maximum number of spins in backoff loop.
-  template <unsigned kMaxBackoff>
-  class ThreadPoolWaiter {
-    // Current number if spins in backoff loop
-    unsigned pause_time_;
-
-   public:
-    void wait() {
-      // If kMaxBackoff is zero don't do any pausing.
-      if constexpr (kMaxBackoff == 1) {
-        onnxruntime::concurrency::SpinPause();
-      } else if constexpr (kMaxBackoff > 1) {
-        // Exponential backoff
-        unsigned pause_time = pause_time_ + 1U;
-        for (unsigned i = 0; i < pause_time; ++i) {
-          onnxruntime::concurrency::SpinPause();
-        }
-        pause_time_ = (pause_time * 2U) % kMaxBackoff;
-      }
-    }
-  };
-
   ThreadPoolTempl(const CHAR_TYPE* name, int num_threads, bool allow_spinning, Environment& env,
                   const ThreadOptions& thread_options)
       : profiler_(num_threads, name),

@@ -931,9 +908,8 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
   // finish dispatch work. This avoids new tasks being started
   // concurrently with us attempting to end the parallel section.
   if (ps.dispatch_q_idx != -1) {
-    ThreadPoolWaiter<4> waiter{};
     while (!ps.dispatch_done.load(std::memory_order_acquire)) {
-      waiter.wait();
+      onnxruntime::concurrency::SpinPause();
     }
   }

@@ -955,17 +931,15 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
   // Wait for the dispatch task's own work...
   if (ps.dispatch_q_idx > -1) {
-    ThreadPoolWaiter<kIsHybrid ? 0 : 1> waiter{};
     while (!ps.work_done.load(std::memory_order_acquire)) {
-      waiter.wait();
+      onnxruntime::concurrency::SpinPause();
     }
   }

   // ...and wait for any other tasks not revoked to finish their work
   auto tasks_to_wait_for = tasks_started - ps.tasks_revoked;
-  ThreadPoolWaiter<kIsHybrid ? 0 : 1> waiter{};
   while (ps.tasks_finished < tasks_to_wait_for) {
-    waiter.wait();
+    onnxruntime::concurrency::SpinPause();
   }

   // Clear status to allow the ThreadPoolParallelSection to be

@@ -1283,10 +1257,9 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
   // Increase the worker count if needed. Each worker will pick up
   // loops to execute from the current parallel section.
   std::function<void(unsigned)> worker_fn = [&ps](unsigned par_idx) {
-    ThreadPoolWaiter<kIsHybrid ? 4 : 0> waiter{};
     while (ps.active) {
       if (ps.current_loop.load() == nullptr) {
-        waiter.wait();
+        onnxruntime::concurrency::SpinPause();
       } else {
         ps.workers_in_loop++;
         ThreadPoolLoop* work_item = ps.current_loop;

@@ -1307,9 +1280,8 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
   // Wait for workers to exit the loop
   ps.current_loop = 0;
-  ThreadPoolWaiter<kIsHybrid ? 1 : 4> waiter{};
   while (ps.workers_in_loop) {
-    waiter.wait();
+    onnxruntime::concurrency::SpinPause();
   }
   profiler_.LogEnd(ThreadPoolProfiler::WAIT);
 }

@@ -1560,30 +1532,13 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter

   assert(td.GetStatus() == WorkerData::ThreadStatus::Spinning);

-  // The exact value of spin_count and steal_count are arbitrary and
-  // were experimentally determined. These numbers yielded the best
-  // performance across a range of workloads and
-  // machines. Generally, the goal of tuning spin_count is to make
-  // the number as small as possible while ensuring there is enough
-  // slack so that if each core is doing the same amount of work it
-  // won't sleep before they have all finished. The idea here is
-  // that in pipelined workloads, it won't sleep during each stage
-  // if it's done a bit faster than its neighbors, but that if there
-  // are non-equal sizes of work distributed, it won't take too long
-  // to reach sleep giving power (and thus frequency/performance) to
-  // its neighbors. Since hybrid has P/E cores, a lower value is
-  // chosen. On hybrid systems, even with equal sized workloads
-  // distributed the compute time won't stay synced. Typically in
-  // the hybrid case the P cores finish first (and are thus waiting)
-  // which is essentially a priority inversion.
-  constexpr int pref_spin_count = kIsHybrid ? 5000 : 10000;
-  const int spin_count = allow_spinning_ ? pref_spin_count : 0;
-  constexpr int steal_count = pref_spin_count / (kIsHybrid ? 25 : 100);
+  constexpr int log2_spin = 20;
+  const int spin_count = allow_spinning_ ? (1ull << log2_spin) : 0;
+  const int steal_count = spin_count / 100;

   SetDenormalAsZero(set_denormal_as_zero_);
   profiler_.LogThreadId(thread_id);

-  ThreadPoolWaiter<kIsHybrid ? 1 : 8> waiter{};
   while (!should_exit) {
     Task t = q.PopFront();
     if (!t) {

@@ -1599,7 +1554,7 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter
       if (spin_loop_status_.load(std::memory_order_relaxed) == SpinLoopStatus::kIdle) {
         break;
       }
-      waiter.wait();
+      onnxruntime::concurrency::SpinPause();
     }

     // Attempt to block

include/onnxruntime/core/platform/threadpool.h (+2 −3)

@@ -129,7 +129,7 @@ struct TensorOpCost {

 namespace concurrency {

-template <typename Environment, bool kIsHybrid>
+template <typename Environment>
 class ThreadPoolTempl;

 class ExtendedThreadPoolInterface;

@@ -424,8 +424,7 @@ class ThreadPool {
   ExtendedThreadPoolInterface* underlying_threadpool_ = nullptr;

   // If used, underlying_threadpool_ is instantiated and owned by the ThreadPool.
-  std::unique_ptr<ThreadPoolTempl<Env, true>> extended_eigen_hybrid_threadpool_;
-  std::unique_ptr<ThreadPoolTempl<Env, false>> extended_eigen_normal_threadpool_;
+  std::unique_ptr<ThreadPoolTempl<Env> > extended_eigen_threadpool_;

   // Force the thread pool to run in hybrid mode on a normal cpu.
   bool force_hybrid_ = false;

include/onnxruntime/core/session/onnxruntime_c_api.h (+19)

@@ -4722,6 +4722,25 @@ struct OrtApi {
   * \param[in] adapter OrtLoraAdapter instance
   */
  ORT_API2_STATUS(RunOptionsAddActiveLoraAdapter, _Inout_ OrtRunOptions* options, _In_ const OrtLoraAdapter* adapter);
+
+ /// @}
+ /// \name OrtEpDynamicOptions
+ /// @{
+
+ /** \brief Set DynamicOptions for EPs (Execution Providers)
+  *
+  * Valid options can be found in `include\onnxruntime\core\session\onnxruntime_session_options_config_keys.h`
+  * Look for `kOrtEpDynamicOptions`
+  *
+  * \param[in] sess The OrtSession
+  * \param[in] keys List of keys represented by null-terminated strings
+  * \param[in] values List of values represented by null-terminated strings
+  * \param[in] kv_len Number of key-value pairs
+  *
+  * \since Version 1.20
+  */
+ ORT_API2_STATUS(SetEpDynamicOptions, _Inout_ OrtSession* sess, _In_reads_(kv_len) const char* const* keys,
+                 _In_reads_(kv_len) const char* const* values, _In_ size_t kv_len);
 };

 /*
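For orientation, a minimal sketch of calling the new entry through the OrtApi function table follows; the existing `OrtSession*` is assumed, and error handling is reduced to releasing the status:

```cpp
// Sketch assuming an already-created OrtSession* named "session".
#include "onnxruntime_c_api.h"

void SetWorkloadTypeEfficient(OrtSession* session) {
  const OrtApi* api = OrtGetApiBase()->GetApi(ORT_API_VERSION);

  const char* keys[] = {"ep.dynamic.workload_type"};
  const char* values[] = {"Efficient"};

  // Forwards one key/value pair to every execution provider in the session.
  OrtStatus* status = api->SetEpDynamicOptions(session, keys, values, 1);
  if (status != nullptr) {
    // In real code, inspect api->GetErrorMessage(status) before releasing.
    api->ReleaseStatus(status);
  }
}
```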

include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h (−5)

@@ -49,8 +49,3 @@ static const char* const kOrtRunOptionsConfigQnnRpcControlLatency = "qnn.rpc_con
 // If the value is set to -1, cuda graph capture/replay is disabled in that run.
 // User are not expected to set the value to 0 as it is reserved for internal use.
 static const char* const kOrtRunOptionsConfigCudaGraphAnnotation = "gpu_graph_id";
-
-// Specify the type of workload for this run.
-// “Default”: OS determines the scheduling priority and processor performance to service this workload. [Default]
-// “Efficient”: OS treats this workload is efficiency oriented with low scheduling priority and efficient processor performance.
-static const char* const kOrtRunOptionsWorkloadType = "run.workload_type";

include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h (+3 −1)

@@ -283,7 +283,9 @@ static const char* const kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16 = "mlas
 // If not provided, default is 4.
 static const char* const kOrtSessionOptionsQDQMatMulNBitsAccuracyLevel = "session.qdq_matmulnbits_accuracy_level";

+// THIS OPTION IS NOT A REGULAR SESSION OPTION SINCE IT CAN BE MODIFIED AT ANY TIME
+// Meant to be used with SetEpDynamicOptions
 // Specify the type of workload for this session.
 // “Default”: OS determines the scheduling priority and processor performance to service this workload. [Default]
 // “Efficient”: OS treats this workload is efficiency oriented with low scheduling priority and efficient processor performance.
-static const char* const kOrtSessionOptionsWorkloadType = "session.workload_type";
+static const char* const kOrtEpDynamicOptionsWorkloadType = "ep.dynamic.workload_type";
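The rename from `kOrtSessionOptionsWorkloadType` to `kOrtEpDynamicOptionsWorkloadType` (and the removal of the run-option variant above) reflects that workload type is now changed through SetEpDynamicOptions rather than fixed per session or per run. A speculative end-to-end sketch of toggling it around a low-priority phase; the helper function, phase boundaries, and ignored return statuses are placeholders:

```cpp
// Sketch only: lower the EP workload priority for a background phase, then
// restore it. "api" and "session" are assumed to exist; statuses unchecked.
#include "onnxruntime_c_api.h"
#include "onnxruntime_session_options_config_keys.h"

void RunBackgroundPhase(const OrtApi* api, OrtSession* session) {
  const char* key = kOrtEpDynamicOptionsWorkloadType;  // "ep.dynamic.workload_type"
  const char* efficient = "Efficient";
  const char* normal = "Default";

  api->SetEpDynamicOptions(session, &key, &efficient, 1);
  // ... issue low-priority Run() calls here ...
  api->SetEpDynamicOptions(session, &key, &normal, 1);
}
```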
