ORT 1.19.0 Release: Cherry-Pick Round 1 (#21619)
### Description

PRs marked for cherry-pick.

### Motivation and Context

ORT 1.19.0 Release Preparation

---------

Signed-off-by: Liqun Fu <[email protected]>
Signed-off-by: liqunfu <[email protected]>
Signed-off-by: Liqun Fu <[email protected]>
Co-authored-by: liqun Fu <[email protected]>
Co-authored-by: Jing Fang <[email protected]>
Co-authored-by: Tianlei Wu <[email protected]>
Co-authored-by: Adrian Lizarraga <[email protected]>
Co-authored-by: Changming Sun <[email protected]>
Co-authored-by: Sumit Agarwal <[email protected]>
Co-authored-by: vraspar <[email protected]>
Co-authored-by: Scott McKay <[email protected]>
Co-authored-by: Edward Chen <[email protected]>
Co-authored-by: Yi Zhang <[email protected]>
Co-authored-by: jingyanwangms <[email protected]>
Co-authored-by: Yi Zhang <[email protected]>
Co-authored-by: Chi Lo <[email protected]>
Co-authored-by: saurabh <[email protected]>
Co-authored-by: sfatimar <[email protected]>
16 people authored Aug 12, 2024
1 parent ee2fe87 commit ccf6a28
Showing 125 changed files with 11,352 additions and 1,689 deletions.
2 changes: 1 addition & 1 deletion .pipelines/nuget_config/x64/packages.config
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="utf-8"?>
 <packages>
   <package id="python" version="3.9.7" targetFramework="native" />
-  <package id="Microsoft.AI.DirectML" version="1.15.0" targetFramework="native" />
+  <package id="Microsoft.AI.DirectML" version="1.15.1" targetFramework="native" />
   <package id="Microsoft.Windows.CppWinRT" version="2.0.201201.7" targetFramework="native" />
 </packages>
2 changes: 1 addition & 1 deletion .pipelines/nuget_config/x86/packages.config
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="utf-8"?>
 <packages>
   <package id="pythonx86" version="3.9.7" targetFramework="native" />
-  <package id="Microsoft.AI.DirectML" version="1.15.0" targetFramework="native" />
+  <package id="Microsoft.AI.DirectML" version="1.15.1" targetFramework="native" />
   <package id="Microsoft.Windows.CppWinRT" version="2.0.201201.7" targetFramework="native" />
 </packages>
2 changes: 1 addition & 1 deletion cmake/external/dml.cmake
@@ -41,7 +41,7 @@ if (NOT onnxruntime_USE_CUSTOM_DIRECTML)
     set(NUGET_CONFIG ${PROJECT_SOURCE_DIR}/../NuGet.config)
     set(PACKAGES_CONFIG ${PROJECT_SOURCE_DIR}/../packages.config)
     get_filename_component(PACKAGES_DIR ${CMAKE_CURRENT_BINARY_DIR}/../packages ABSOLUTE)
-    set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.15.0)
+    set(DML_PACKAGE_DIR ${PACKAGES_DIR}/Microsoft.AI.DirectML.1.15.1)

     # Restore nuget packages, which will pull down the DirectML redist package.
     add_custom_command(
12 changes: 8 additions & 4 deletions cmake/onnxruntime.cmake
@@ -38,10 +38,14 @@ function(get_c_cxx_api_headers HEADERS_VAR)

   # need to add header files for enabled EPs
   foreach(f ${ONNXRUNTIME_PROVIDER_NAMES})
-    file(GLOB _provider_headers CONFIGURE_DEPENDS
-      "${REPO_ROOT}/include/onnxruntime/core/providers/${f}/*.h"
-    )
-    list(APPEND _headers ${_provider_headers})
+    # The header files in include/onnxruntime/core/providers/cuda directory cannot be flattened to the same directory
+    # with onnxruntime_c_api.h . Most other EPs probably also do not work in this way.
+    if((NOT f STREQUAL cuda) AND (NOT f STREQUAL rocm))
+      file(GLOB _provider_headers CONFIGURE_DEPENDS
+        "${REPO_ROOT}/include/onnxruntime/core/providers/${f}/*.h"
+      )
+      list(APPEND _headers ${_provider_headers})
+    endif()
   endforeach()

   set(${HEADERS_VAR} ${_headers} PARENT_SCOPE)
13 changes: 11 additions & 2 deletions cmake/onnxruntime_mlas.cmake
@@ -555,8 +555,17 @@ else()
         ${MLAS_SRC_DIR}/intrinsics/avx2/qdwconv_avx2.cpp
         ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx2.cpp
       )
-      set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")

+      message(STATUS "CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")
+      message(STATUS "CMAKE_CXX_COMPILER_VERSION: ${CMAKE_CXX_COMPILER_VERSION}")
+
+      if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" OR CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "10")
+        message(STATUS "Using -mavx2 -mfma -mavxvnni flags")
+        set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma -mavxvnni")
+      else()
+        message(STATUS "Using -mavx2 -mfma flags")
+        set_source_files_properties(${mlas_platform_srcs_avx2} PROPERTIES COMPILE_FLAGS "-mavx2 -mfma")
+      endif()
       set(mlas_platform_srcs_avx512f
         ${MLAS_SRC_DIR}/x86_64/DgemmKernelAvx512F.S
         ${MLAS_SRC_DIR}/x86_64/SgemmKernelAvx512F.S
@@ -575,7 +584,7 @@ else()
         ${MLAS_SRC_DIR}/x86_64/ConvSymKernelAvx512Core.S
         ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512.cpp
       )
-      set_source_files_properties(${mlas_platform_srcs_avx512core} PROPERTIES COMPILE_FLAGS "-mavx512bw -mavx512dq -mavx512vl")
+      set_source_files_properties(${mlas_platform_srcs_avx512core} PROPERTIES COMPILE_FLAGS "-mfma -mavx512vnni -mavx512bw -mavx512dq -mavx512vl")

       set(mlas_platform_srcs_avx512vnni
         ${MLAS_SRC_DIR}/sqnbitgemm_kernel_avx512vnni.cpp
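Context for the new compiler guard: GCC only exposes `-mavxvnni` and the matching AVX-VNNI intrinsics starting with GCC 11, which is why the flag is added only when the compiler is not GNU or its version is greater than 10. A minimal sketch, not part of this commit, of the kind of u8 x s8 dot product these MLAS kernels can issue in one instruction when the flag is available (the helper name and AVX2 fallback are illustrative only):

```cpp
#include <immintrin.h>

// Hypothetical helper: accumulate a u8 x s8 dot product into s32 lanes.
static inline __m256i DotProductAccumulate(__m256i acc, __m256i a_u8, __m256i b_s8) {
#if defined(__AVXVNNI__)
  // Single instruction when compiled with -mavxvnni (GCC 11+, recent Clang).
  return _mm256_dpbusd_avx_epi32(acc, a_u8, b_s8);
#else
  // AVX2 fallback: widen u8*s8 into saturating s16 pairs, then sum pairs to s32.
  __m256i pairs = _mm256_maddubs_epi16(a_u8, b_s8);
  __m256i sums = _mm256_madd_epi16(pairs, _mm256_set1_epi16(1));
  return _mm256_add_epi32(acc, sums);
#endif
}
```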
1 change: 1 addition & 0 deletions cmake/onnxruntime_providers_cpu.cmake
@@ -219,6 +219,7 @@ if (onnxruntime_ENABLE_TRAINING)
 endif()

 install(FILES ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/cpu/cpu_provider_factory.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/)
+install(FILES ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/resource.h ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/custom_op_context.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers)
 set_target_properties(onnxruntime_providers PROPERTIES LINKER_LANGUAGE CXX)
 set_target_properties(onnxruntime_providers PROPERTIES FOLDER "ONNXRuntime")
9 changes: 8 additions & 1 deletion cmake/onnxruntime_providers_cuda.cmake
@@ -289,8 +289,15 @@
     config_cuda_provider_shared_module(onnxruntime_providers_cuda_obj)
   endif()
   config_cuda_provider_shared_module(onnxruntime_providers_cuda)
-
+  # Cannot use glob because the file cuda_provider_options.h should not be exposed out.
+  set(ONNXRUNTIME_CUDA_PROVIDER_PUBLIC_HEADERS
+    "${REPO_ROOT}/include/onnxruntime/core/providers/cuda/cuda_context.h"
+    "${REPO_ROOT}/include/onnxruntime/core/providers/cuda/cuda_resource.h"
+  )
+  set_target_properties(onnxruntime_providers_cuda PROPERTIES
+    PUBLIC_HEADER "${ONNXRUNTIME_CUDA_PROVIDER_PUBLIC_HEADERS}")
   install(TARGETS onnxruntime_providers_cuda
+          PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers/cuda
           ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
           LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
           RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
7 changes: 6 additions & 1 deletion cmake/onnxruntime_providers_rocm.cmake
@@ -223,8 +223,13 @@
   if (onnxruntime_ENABLE_ATEN)
     target_compile_definitions(onnxruntime_providers_rocm PRIVATE ENABLE_ATEN)
   endif()
-
+  file(GLOB ONNXRUNTIME_ROCM_PROVIDER_PUBLIC_HEADERS CONFIGURE_DEPENDS
+    "${REPO_ROOT}/include/onnxruntime/core/providers/rocm/*.h"
+  )
+  set_target_properties(onnxruntime_providers_rocm PROPERTIES
+    PUBLIC_HEADER "${ONNXRUNTIME_ROCM_PROVIDER_PUBLIC_HEADERS}")
   install(TARGETS onnxruntime_providers_rocm
+          PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core/providers/rocm
           ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
           LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
           RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})

@@ -1,7 +1,7 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <ItemGroup Condition="('$(OutputType)'!='Library' OR '$(IsAppExtension)'=='True')">
-    <NativeReference Include="$(MSBuildThisFileDirectory)..\..\runtimes\ios\native\onnxruntime.xcframework">
+    <NativeReference Include="$(MSBuildThisFileDirectory)..\..\runtimes\ios\native\onnxruntime.xcframework.zip">
       <Kind>Static</Kind>
       <IsCxx>True</IsCxx>
       <SmartLink>True</SmartLink>
@@ -10,4 +10,4 @@
       <WeakFrameworks>CoreML</WeakFrameworks>
     </NativeReference>
   </ItemGroup>
-</Project>
\ No newline at end of file
+</Project>
9 changes: 7 additions & 2 deletions include/onnxruntime/core/optimizer/graph_transformer_utils.h
@@ -3,12 +3,15 @@

 #pragma once

+#include <string>
 #include <memory>
+#include <unordered_map>
 #include <unordered_set>
 #include <vector>

 #include "core/common/inlined_containers.h"
 #include "core/framework/session_options.h"
+#include "core/framework/tensor.h"
 #include "core/optimizer/graph_transformer.h"
 #include "core/platform/threadpool.h"

@@ -51,7 +54,8 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformers(
     const SessionOptions& session_options,
     const IExecutionProvider& execution_provider /*required by constant folding*/,
     const InlinedHashSet<std::string>& rules_and_transformers_to_disable = {},
-    concurrency::ThreadPool* intra_op_thread_pool = nullptr);
+    concurrency::ThreadPool* intra_op_thread_pool = nullptr,
+    std::unordered_map<std::string, std::unique_ptr<Tensor>>* p_buffered_tensors = nullptr);

 #endif  // !defined(ORT_MINIMAL_BUILD)

@@ -81,7 +85,8 @@ InlinedVector<std::unique_ptr<GraphTransformer>> GenerateTransformersForMinimalBuild(
     const SatApplyContextVariant& apply_context,
     const IExecutionProvider& cpu_execution_provider,
     const InlinedHashSet<std::string>& rules_and_transformers_to_disable = {},
-    concurrency::ThreadPool* intra_op_thread_pool = nullptr);
+    concurrency::ThreadPool* intra_op_thread_pool = nullptr,
+    std::unordered_map<std::string, std::unique_ptr<Tensor>>* p_buffered_tensors = nullptr);

 #endif  // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD)
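The new optional `p_buffered_tensors` argument gives transformers a place to hand pre-packed tensors back to the session, keyed by name. A hedged, non-compilable sketch of a call site, assuming the surrounding objects (`session_options`, `cpu_execution_provider`) come from an initialized session and that the enclosing namespace and `TransformerLevel` value match the rest of the header:

```cpp
// Sketch only; names outside this header are assumptions.
std::unordered_map<std::string, std::unique_ptr<Tensor>> buffered_tensors;

InlinedVector<std::unique_ptr<GraphTransformer>> transformers =
    optimizer_utils::GenerateTransformers(TransformerLevel::Level1,
                                          session_options,
                                          cpu_execution_provider,
                                          /*rules_and_transformers_to_disable=*/{},
                                          /*intra_op_thread_pool=*/nullptr,
                                          &buffered_tensors);
// After transformation, buffered_tensors owns whatever tensors the
// transformers chose to buffer for later use by the session.
```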

@@ -19,7 +19,7 @@ struct OrtTensorRTProviderOptionsV2 {
   // can be updated using: UpdateTensorRTProviderOptionsWithValue
   int trt_max_partition_iterations{1000};  // maximum iterations for TensorRT parser to get capability
   int trt_min_subgraph_size{1};            // minimum size of TensorRT subgraphs
-  size_t trt_max_workspace_size{1 << 30};  // maximum workspace size for TensorRT.
+  size_t trt_max_workspace_size{0};        // maximum workspace size for TensorRT. Default is 0 means max device memory size
   int trt_fp16_enable{0};  // enable TensorRT FP16 precision. Default 0 = false, nonzero = true
   int trt_int8_enable{0};  // enable TensorRT INT8 precision. Default 0 = false, nonzero = true
   const char* trt_int8_calibration_table_name{nullptr};  // TensorRT INT8 calibration table name.
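This changes the default TensorRT workspace cap from a hard 1 GiB (`1 << 30`) to 0, which the new comment defines as "use up to the device's memory". Callers that depended on the old bound can pin it explicitly through the string-based provider-options API; a minimal sketch using the public C/C++ API (error handling reduced to `ThrowOnError`):

```cpp
#include <onnxruntime_cxx_api.h>

int main() {
  const OrtApi& api = Ort::GetApi();

  OrtTensorRTProviderOptionsV2* trt_options = nullptr;
  Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&trt_options));

  // Restore the pre-1.19 behavior: cap the builder workspace at 1 GiB.
  const char* keys[] = {"trt_max_workspace_size"};
  const char* values[] = {"1073741824"};
  Ort::ThrowOnError(api.UpdateTensorRTProviderOptions(trt_options, keys, values, 1));

  Ort::SessionOptions session_options;
  Ort::ThrowOnError(
      api.SessionOptionsAppendExecutionProvider_TensorRT_V2(session_options, trt_options));

  api.ReleaseTensorRTProviderOptions(trt_options);
  return 0;
}
```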
1 change: 0 additions & 1 deletion onnxruntime/contrib_ops/cpu/bert/attention_base.cc
@@ -258,7 +258,6 @@ Status AttentionBase::CheckInputs(const TensorShape& input_shape,
     output_parameters->scale = scale_;
     output_parameters->mask_type = mask_type;
     output_parameters->broadcast_res_pos_bias = broadcast_res_pos_bias;
-    output_parameters->pass_past_in_kv = false;
     output_parameters->qkv_format = Q_K_V_BNSH;
   }

13 changes: 10 additions & 3 deletions onnxruntime/contrib_ops/cpu/bert/attention_common.h
@@ -6,6 +6,12 @@
 namespace onnxruntime {
 namespace contrib {

+enum AttentionType {
+  kAttention,
+  kMultiHeadAttention,
+  kDecoderMaskedMultiHeadAttention,
+};
+
 enum AttentionMaskType {
   MASK_NONE,            // No mask
   MASK_1D_KEY_SEQ_LEN,  // [batch_size], key sequence length
@@ -24,10 +30,12 @@ enum AttentionQkvFormat {
   UNKNOWN,               // enum value not set, or depends on qkv projection implementation details
   Q_K_V_BNSH,            // for non-packed qkv, permuted
   Q_K_V_BSNH,            // for non-packed qkv, not permuted, used by memory efficient attention or MultiHeadAttention
-  QKV_BSN3H,             // for TRT fused attention, qkv are packed
   Q_K_V_BSNH_BNSH_BNSH,  // for cross attention, k and v are permuted
   Q_K_V_BNSH_QKV_BS3NH,  // for TRT fused causal attention, data has two formats (qkv is 3BNSH, gemm_buffer is BS3NH)
-  Q_KV_BSNH_BSN2H,       // for TRT fused cross attention, kv are packed
   Q_K_V_TNH,             // for memory efficient attention, qkv are not packed, and paddings are removed.
+  Q_KV_BSNH_BSN2H,       // for TRT fused cross attention, kv are packed
+  QKV_BSN3H,             // for TRT fused attention, qkv are packed
+  QKV_BS3NH,             // for DecoderMaskedMultiHeadAttention, qkv are packed
+  QKV_TN3H,              // for TRT fused attention, qkv are packed and paddings are removed
 };

@@ -61,7 +69,6 @@ struct AttentionParameters {
   bool past_present_share_buffer;
   bool do_rotary;
   bool broadcast_res_pos_bias;
-  bool pass_past_in_kv;
   float mask_filter_value;
   float scale;
   bool use_tf32;
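The enumerator names encode memory layout: B = batch, S = sequence length, N = number of heads, H = head size, and a packed-count digit (`N3H`, `N2H`, `3NH`) means Q/K/V share one buffer. A small self-contained sketch (not ORT code) of what the two unpacked layouts imply for addressing:

```cpp
#include <cstddef>

// Flat-buffer offset for Q_K_V_BSNH: heads are interleaved within each token.
constexpr std::size_t OffsetBSNH(std::size_t b, std::size_t s, std::size_t n, std::size_t h,
                                 std::size_t S, std::size_t N, std::size_t H) {
  return ((b * S + s) * N + n) * H + h;
}

// Flat-buffer offset for Q_K_V_BNSH: each head's sequence is contiguous.
constexpr std::size_t OffsetBNSH(std::size_t b, std::size_t s, std::size_t n, std::size_t h,
                                 std::size_t S, std::size_t N, std::size_t H) {
  return ((b * N + n) * S + s) * H + h;
}

static_assert(OffsetBSNH(0, 1, 0, 0, 4, 2, 8) == 16, "advancing one token skips N*H values");
static_assert(OffsetBNSH(0, 1, 0, 0, 4, 2, 8) == 8, "advancing one token skips H values");
```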
15 changes: 4 additions & 11 deletions onnxruntime/contrib_ops/cpu/bert/multihead_attention.cc
@@ -85,7 +85,7 @@ Status MultiHeadAttention<T>::Compute(OpKernelContext* context) const {
                                                   scale_,
                                                   is_unidirectional_,
                                                   past_present_share_buffer,
-                                                  false));
+                                                  kMultiHeadAttention));

   const int batch_size = parameters.batch_size;
   const int q_sequence_length = parameters.sequence_length;
@@ -121,20 +121,13 @@ Status MultiHeadAttention<T>::Compute(OpKernelContext* context) const {
   AllocatorPtr allocator;
   ORT_RETURN_IF_ERROR(context->GetTempSpaceAllocator(&allocator));

-  // For each of Q/K/V, there are multiple scenarios:
-  // 1) Combined QKV bias is null
-  //    a) Q/K/V is (B, S, D)
-  //    b) Q/K/V is (B, S, N, H)
-  // 2) No packed QKV in Q
-  //    a) Q/K/V has seq_len = 1
-  //    b) Q/K/V has seq_len > 1
-
   OrtValue Q;
   ORT_RETURN_IF_ERROR(MaybeTransposeToBNSHAndAddBias<T>(
       context, allocator, batch_size, num_heads_, q_sequence_length, qk_head_size, query, bias, q_bias_offset, Q));

-  if (parameters.pass_past_in_kv) {  // key and value in BNSH format
-    assert(bias == nullptr);
+  if (parameters.qkv_format == Q_K_V_BSNH_BNSH_BNSH) {
+    // For cross attention with k and v in BNSH format, we assume that bias for key and value are zeros.
+    // So we don't need to add bias for key and value here.
     assert(past_key == nullptr);
     assert(past_value == nullptr);
     return ApplyAttention(Q.GetMutable<Tensor>()->MutableData<T>(),