Commit e812aea

Merge pull request #789 from intel/sync_msft_25082025
Sync with Microsoft ONNX Runtime - 25/08/2025
2 parents 0bad3d7 + cb59e2d commit e812aea

File tree: 109 files changed (+4036 / -1466 lines)


.github/workflows/mac.yml

Lines changed: 5 additions & 0 deletions

@@ -39,6 +39,7 @@ jobs:
           {"machine": "arm64", "target": "arm64", "build_config": "Debug"},
           {"machine": "arm64", "target": "arm64", "build_config": "Release"}
         ]
+
   xnnpack:
     uses: ./.github/workflows/macos-ci-build-and-test-workflow.yml
     with:
@@ -59,11 +60,13 @@ jobs:
           {"machine": "arm64", "target": "arm64", "build_config": "Debug"},
           {"machine": "arm64", "target": "arm64", "build_config": "Release"}
         ]
+
   iphone_simulator:
     runs-on: macos-15

     env:
       xcode_version: 16.4
+      simulator_runtime_version: 18.5

     strategy:
       matrix:
@@ -100,6 +103,8 @@ jobs:
           --apple_deploy_target=15.1 \
           --apple_sysroot=iphonesimulator \
           --osx_arch=${{ matrix.target_arch }}
+        env:
+          ORT_GET_SIMULATOR_DEVICE_INFO_REQUESTED_RUNTIME_VERSION: ${{ env.simulator_runtime_version }}

   Objective-C-StaticAnalysis:
     runs-on: macos-14

cmake/CMakeLists.txt

Lines changed: 13 additions & 1 deletion

@@ -98,7 +98,8 @@ option(onnxruntime_USE_VSINPU "Build with VSINPU support" OFF)

 cmake_dependent_option(onnxruntime_USE_FLASH_ATTENTION "Build flash attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF)
 option(onnxruntime_USE_LEAN_ATTENTION "Build lean attention kernel for scaled dot product attention" OFF)
-option(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION "Build memory efficient attention kernel for scaled dot product attention" ON)
+cmake_dependent_option(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION "Build memory efficient attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF)
+cmake_dependent_option(onnxruntime_USE_FPA_INTB_GEMM "Build FpA IntB gemm cuda kernels" ON "onnxruntime_USE_CUDA" OFF)

 option(onnxruntime_BUILD_FOR_NATIVE_MACHINE "Enable this option for turning on optimization specific to this machine" OFF)
 option(onnxruntime_USE_AVX "Use AVX instructions" OFF)
@@ -696,6 +697,7 @@ if (onnxruntime_USE_CUDA)
     set(onnxruntime_USE_FLASH_ATTENTION OFF)
     set(onnxruntime_USE_LEAN_ATTENTION OFF)
     set(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION OFF)
+    set(onnxruntime_USE_FPA_INTB_GEMM OFF)
   endif()

   if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.6)
@@ -708,6 +710,11 @@ if (onnxruntime_USE_CUDA)
     set(onnxruntime_USE_FLASH_ATTENTION OFF)
   endif()

+  if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12)
+    message( STATUS "FpA IntB Gemm unsupported for CUDA compiler version < 12.0")
+    set(onnxruntime_USE_FPA_INTB_GEMM OFF)
+  endif()
+
   if (WIN32)
     message( STATUS "Lean Attention unsupported in Windows")
     set(onnxruntime_USE_LEAN_ATTENTION OFF)
@@ -736,6 +743,11 @@ if (onnxruntime_USE_CUDA)
     message( STATUS "Enable memory efficient attention for CUDA EP")
     list(APPEND ORT_PROVIDER_FLAGS -DUSE_MEMORY_EFFICIENT_ATTENTION=1)
   endif()
+
+  if (onnxruntime_USE_FPA_INTB_GEMM)
+    message( STATUS "Enable FpA IntB Gemm for CUDA EP")
+    list(APPEND ORT_PROVIDER_FLAGS -DUSE_FPA_INTB_GEMM=1)
+  endif()
 endif()

 if (onnxruntime_USE_CUDA_INTERFACE AND (NOT onnxruntime_USE_CUDA))
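
For context, a minimal sketch (not part of this commit) of how provider source code can consume the new build flag: when onnxruntime_USE_FPA_INTB_GEMM is enabled, -DUSE_FPA_INTB_GEMM=1 is appended to ORT_PROVIDER_FLAGS, so a CUDA EP translation unit can select the FpA IntB GEMM path at compile time. The function name and messages below are illustrative only.

#include <iostream>

// Chooses a GEMM path based on the USE_FPA_INTB_GEMM definition added by CMake.
void ReportGemmPath() {
#if defined(USE_FPA_INTB_GEMM)
  std::cout << "FpA IntB GEMM kernels compiled in (CUDA >= 12 build)\n";
#else
  std::cout << "FpA IntB GEMM not built; using the default GEMM path\n";
#endif
}

int main() {
  ReportGemmPath();
  return 0;
}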

cmake/external/onnxruntime_external_deps.cmake

Lines changed: 7 additions & 1 deletion

@@ -331,7 +331,13 @@ if (onnxruntime_ENABLE_CPUINFO)
     set(CPUINFO_SUPPORTED TRUE)
   endif()
   if (WIN32)
-    set(CPUINFO_SUPPORTED TRUE)
+    # There's an error when linking with cpuinfo on arm64ec with a vcpkg build (--use_vcpkg).
+    # TODO Fix it and then re-enable cpuinfo on arm64ec.
+    if (onnxruntime_target_platform STREQUAL "ARM64EC")
+      set(CPUINFO_SUPPORTED FALSE)
+    else()
+      set(CPUINFO_SUPPORTED TRUE)
+    endif()
   elseif (NOT ${onnxruntime_target_platform} MATCHES "^(i[3-6]86|AMD64|x86(_64)?|armv[5-8].*|aarch64|arm64)$")
     message(WARNING
             "Target processor architecture \"${onnxruntime_target_platform}\" is not supported in cpuinfo. "
cmake/vcpkg-ports/cpuinfo/patch_cpuinfo_h_for_arm64ec.patch

Lines changed: 22 additions & 0 deletions

@@ -0,0 +1,22 @@
+diff --git a/include/cpuinfo.h b/include/cpuinfo.h
+index f1d35d4..9e454d2 100644
+--- a/include/cpuinfo.h
++++ b/include/cpuinfo.h
+@@ -18,7 +18,7 @@
+ #define CPUINFO_ARCH_X86 1
+ #endif
+
+-#if defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
++#if defined(__x86_64__) || defined(__x86_64) || (defined(_M_X64) && !defined(_M_ARM64EC)) || (defined(_M_AMD64) && !defined(_M_ARM64EC))
+ #define CPUINFO_ARCH_X86_64 1
+ #endif
+
+@@ -26,7 +26,7 @@
+ #define CPUINFO_ARCH_ARM 1
+ #endif
+
+-#if defined(__aarch64__) || defined(_M_ARM64)
++#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
+ #define CPUINFO_ARCH_ARM64 1
+ #endif
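
For context, a small self-contained sketch (not part of the patch) of the compiler behavior it works around: when MSVC targets ARM64EC it predefines _M_ARM64EC together with the x64 macros _M_X64 and _M_AMD64 (and does not define _M_ARM64), so the unpatched header would classify an ARM64EC build as x86_64. The patched conditions exclude the x64 branch when _M_ARM64EC is set and add _M_ARM64EC to the ARM64 branch.

#include <iostream>

// Mirrors the patched cpuinfo.h logic: check _M_ARM64EC before the x64 macros.
int main() {
#if defined(_M_ARM64EC)
  std::cout << "ARM64EC: _M_X64/_M_AMD64 are also predefined, so macro order matters\n";
#elif defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64)
  std::cout << "x86_64\n";
#elif defined(__aarch64__) || defined(_M_ARM64)
  std::cout << "ARM64\n";
#else
  std::cout << "other architecture\n";
#endif
  return 0;
}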

cmake/vcpkg-ports/cpuinfo/portfile.cmake

Lines changed: 2 additions & 0 deletions

@@ -9,6 +9,8 @@ vcpkg_from_github(
     REF de0ce7c7251372892e53ce9bc891750d2c9a4fd8
     SHA512 0fde9210b700d2648d37c8deeb0d5c0d007d8ca5689578dd3bce4c460886b20d7649f0194d2ea06b02238fe9d4f06193599ec3ab5cafb19f1f860b00404264fa
     HEAD_REF master
+    PATCHES
+        patch_cpuinfo_h_for_arm64ec.patch
 )

 vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS

docs/OperatorKernels.md

Lines changed: 4 additions & 1 deletion

@@ -703,7 +703,10 @@ Do not modify directly.*
 |GreaterOrEqual|*in* A:**T**<br> *in* B:**T**<br> *out* C:**T1**|16+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)<br/> **T1** = tensor(bool)|
 |||[12, 15]|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)<br/> **T1** = tensor(bool)|
 |GridSample|*in* X:**T1**<br> *in* grid:**T2**<br> *out* Y:**T1**|16+|**T1** = tensor(float)<br/> **T2** = tensor(float)|
-|HardSigmoid|*in* X:**T**<br> *out* Y:**T**|6+|**T** = tensor(double), tensor(float), tensor(float16)|
+|HardSigmoid|*in* X:**T**<br> *out* Y:**T**|22+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
+|||[6, 21]|**T** = tensor(double), tensor(float), tensor(float16)|
+|HardSwish|*in* X:**T**<br> *out* Y:**T**|22+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
+|||[14, 21]|**T** = tensor(double), tensor(float), tensor(float16)|
 |Identity|*in* input:**T**<br> *out* output:**T**<br><br>or<br><br>*in* input:**V**<br> *out* output:**V**|19+|**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(float8e4m3fn)), seq(tensor(float8e4m3fnuz)), seq(tensor(float8e5m2)), seq(tensor(float8e5m2fnuz)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |||[14, 18]|**V** = seq(tensor(bfloat16)), seq(tensor(bool)), seq(tensor(double)), seq(tensor(float)), seq(tensor(float16)), seq(tensor(int16)), seq(tensor(int32)), seq(tensor(int64)), seq(tensor(int8)), seq(tensor(uint16)), seq(tensor(uint32)), seq(tensor(uint64)), seq(tensor(uint8)), tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|
 |||13|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)|

include/onnxruntime/core/framework/execution_provider.h

Lines changed: 24 additions & 0 deletions

@@ -36,6 +36,7 @@ class GraphOptimizerRegistry;
 #include "core/framework/framework_provider_common.h"
 #include "core/framework/stream_handles.h"
 #include "core/framework/tuning_context.h"
+#include "core/session/onnxruntime_c_api.h"

 struct OrtEpDevice;
 struct OrtRunOptions;
@@ -322,6 +323,29 @@ class IExecutionProvider {
   virtual common::Status Compile(const std::vector<FusedNodeAndGraph>& fused_nodes_and_graphs,
                                  std::vector<NodeComputeInfo>& node_compute_funcs);

+  /**
+   * Get the compatibility info for a compiled model.
+   *
+   * The execution provider determines this value, which denotes the compatibility of the compiled model with the EP.
+   * This is stored in the model metadata under a key associated with the EP type.
+   */
+  virtual std::string GetCompiledModelCompatibilityInfo(const onnxruntime::GraphViewer& graph_viewer) const {
+    // graph_viewer and model_metadata are not used in the default implementation.
+    ORT_UNUSED_PARAMETER(graph_viewer);
+    // Default implementation returns empty string
+    return std::string();
+  }
+
+  /**
+   * Validate the compatibility of a compiled model with this execution provider.
+   */
+  virtual common::Status ValidateCompiledModelCompatibilityInfo(const std::string& /*compatibility_info*/,
+                                                                OrtCompiledModelCompatibility& model_compatibility) const {
+    // Default implementation indicates this EP does not support model compatibility validation
+    model_compatibility = OrtCompiledModelCompatibility_EP_NOT_APPLICABLE;
+    return Status::OK();
+  }
+
 #endif

   void SetLogger(const logging::Logger* logger) {
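
For context, a minimal sketch (not part of this commit) of how an execution provider might override the two new virtual hooks. The class name, the tag format, the assumed single-string IExecutionProvider constructor, and the OrtCompiledModelCompatibility_EP_SUPPORTED_OPTIMAL enumerator used for the "compatible" case are illustration-only assumptions; only OrtCompiledModelCompatibility_EP_NOT_APPLICABLE appears in this diff, so check onnxruntime_c_api.h for the actual enumerators.

#include <string>

#include "core/framework/execution_provider.h"

namespace onnxruntime {

// Hypothetical EP that stamps compiled models with a kernel-ABI tag and
// validates that tag when a compiled model is loaded again.
class MyExecutionProvider : public IExecutionProvider {
 public:
  MyExecutionProvider() : IExecutionProvider("MyExecutionProvider") {}

  std::string GetCompiledModelCompatibilityInfo(const GraphViewer& graph_viewer) const override {
    ORT_UNUSED_PARAMETER(graph_viewer);
    return "myep_kernel_abi=3";  // illustrative tag; ORT stores it in the model metadata
  }

  common::Status ValidateCompiledModelCompatibilityInfo(
      const std::string& compatibility_info,
      OrtCompiledModelCompatibility& model_compatibility) const override {
    if (compatibility_info == "myep_kernel_abi=3") {
      // Assumed enumerator name for the "fully compatible" case.
      model_compatibility = OrtCompiledModelCompatibility_EP_SUPPORTED_OPTIMAL;
    } else {
      model_compatibility = OrtCompiledModelCompatibility_EP_NOT_APPLICABLE;
    }
    return common::Status::OK();
  }
};

}  // namespace onnxruntime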

include/onnxruntime/core/graph/indexed_sub_graph.h

Lines changed: 1 addition & 1 deletion

@@ -31,7 +31,7 @@ struct IndexedSubGraph {
   std::string domain;  ///< Domain of customized SubGraph/FunctionProto
   int since_version;   ///< Since version of customized SubGraph/FunctionProto.

-  ONNX_NAMESPACE::OperatorStatus status;  ///< Status of customized SubGraph/FunctionProto.
+  ONNX_NAMESPACE::OperatorStatus status{ONNX_NAMESPACE::OperatorStatus::STABLE};  ///< Status of customized SubGraph/FunctionProto.

   std::vector<std::string> inputs;   ///< Inputs of customized SubGraph/FunctionProto.
   std::vector<std::string> outputs;  ///< Outputs of customized SubGraph/FunctionProto.

include/onnxruntime/core/providers/nv_tensorrt_rtx/nv_provider_options.h

Lines changed: 1 addition & 2 deletions

@@ -32,9 +32,8 @@ constexpr const char* kProfilesMinShapes = "nv_profile_min_shapes";
 constexpr const char* kProfilesMaxShapes = "nv_profile_max_shapes";
 constexpr const char* kProfilesOptShapes = "nv_profile_opt_shapes";
 constexpr const char* kCudaGraphEnable = "nv_cuda_graph_enable";
-constexpr const char* kONNXBytestream = "nv_onnx_bytestream";
-constexpr const char* kONNXBytestreamSize = "nv_onnx_bytestream_size";
 constexpr const char* kMultiProfileEnable = "nv_multi_profile_enable";
+constexpr const char* kUseExternalDataInitializer = "nv_use_external_data_initializer";

 }  // namespace provider_option_names
 namespace run_option_names {

include/onnxruntime/core/session/onnxruntime_c_api.h

Lines changed: 12 additions & 1 deletion

@@ -5829,7 +5829,7 @@ struct OrtApi {
    *
    * \since Version 1.23.
    */
-  ORT_API2_STATUS(Graph_GetNodes, const OrtGraph* graph,
+  ORT_API2_STATUS(Graph_GetNodes, _In_ const OrtGraph* graph,
                   _Out_writes_(num_nodes) const OrtNode** nodes, _In_ size_t num_nodes);

   /** \brief Get the parent node for the given graph, if any exists.
@@ -6469,6 +6469,17 @@ struct OrtApi {
                   _In_reads_(num_tensors) OrtValue* const* dst_tensors,
                   _In_opt_ OrtSyncStream* stream,
                   _In_ size_t num_tensors);
+
+  /** \brief Get ::OrtModelMetadata from an ::OrtGraph
+   *
+   * \param[in] graph The OrtGraph instance.
+   * \param[out] out Newly created ::OrtModelMetadata. Must be freed using OrtApi::ReleaseModelMetadata.
+   *
+   * \snippet{doc} snippets.dox OrtStatus Return Value
+   *
+   * \since Version 1.23.
+   */
+  ORT_API2_STATUS(Graph_GetModelMetadata, _In_ const OrtGraph* graph, _Outptr_ OrtModelMetadata** out);
 };

 /*
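
For context, a minimal usage sketch (not part of this commit) of the new Graph_GetModelMetadata entry point. It assumes `graph` is a valid const OrtGraph* (for example, the graph handed to an EP during compilation) and `ort_api` is the OrtApi* obtained from OrtGetApiBase(); error handling is reduced to releasing the status, and the helper function name is invented for illustration.

#include "onnxruntime_c_api.h"

// Reads the model metadata that backs an OrtGraph and releases it afterwards.
void InspectGraphMetadata(const OrtApi* ort_api, const OrtGraph* graph) {
  OrtModelMetadata* metadata = nullptr;
  OrtStatus* status = ort_api->Graph_GetModelMetadata(graph, &metadata);
  if (status != nullptr) {
    ort_api->ReleaseStatus(status);  // call failed; nothing else to release
    return;
  }
  // ... query producer name, version, or custom metadata entries here ...
  ort_api->ReleaseModelMetadata(metadata);  // must be freed per the API contract above
}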
