Skip to content
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ option(onnxruntime_USE_SVE "Build with SVE support in MLAS" OFF)
option(onnxruntime_USE_ARM_NEON_NCHWC "Build with ARM Neon NCHWc kernels in MLAS" OFF)

option(onnxruntime_USE_KLEIDIAI "Build with KleidiAI integration in MLAS" OFF)
option(onnxruntime_USE_QMX_KLEIDIAI_COEXIST "Build with QMX and Arm KLEIDIAI libraries" OFF)
option(onnxruntime_BUILD_UNIT_TESTS "Build ONNXRuntime unit tests" ON)
option(onnxruntime_BUILD_CSHARP "Build C# library" OFF)
option(onnxruntime_BUILD_OBJC "Build Objective-C library" OFF)
Expand Down
3 changes: 3 additions & 0 deletions cmake/deps.txt
Original file line number Diff line number Diff line change
Expand Up @@ -57,4 +57,7 @@ directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v
cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.12.0.zip;7e733cfdc410d777b76122d64232499205589a96
dawn;https://github.com/google/dawn/archive/13c1635a14574ebb7116b56a69f5519301417fda.zip;0aadd28fc385cf7d657d5fc70a352372d2d3c76a
kleidiai;https://github.com/ARM-software/kleidiai/archive/refs/tags/v1.15.0.tar.gz;62ccd24ab60bcef68766440fb42d79071ac2a5d2
# kleidiai-qmx is pinned to a specific commit as there are no tagged releases. When an appropriate tagged release becomes available,
# this entry will be updated to use refs/tags/<version> instead of the raw commit hash.
kleidiai-qmx;https://github.com/qualcomm/kleidiai/archive/2f10c9a8d32f81ffeeb6d4885a29cc35d2b0da87.zip;5e855730a2d69057a569f43dd7532db3b2d2a05c
duktape;https://github.com/svaarala/duktape/releases/download/v2.7.0/duktape-2.7.0.tar.xz;8200c8e417dbab7adcc12c4dbdef7651cfc55794
6 changes: 6 additions & 0 deletions cmake/external/onnxruntime_external_deps.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -845,6 +845,12 @@ if(onnxruntime_USE_KLEIDIAI)

onnxruntime_fetchcontent_declare(kleidiai URL ${DEP_URL_kleidiai} URL_HASH SHA1=${DEP_SHA1_kleidiai} EXCLUDE_FROM_ALL)
onnxruntime_fetchcontent_makeavailable(kleidiai)
# Fetch Qualcomm's kleidiai library
if(onnxruntime_USE_QMX_KLEIDIAI_COEXIST)
onnxruntime_fetchcontent_declare(kleidiai-qmx URL ${DEP_URL_kleidiai-qmx} URL_HASH SHA1=${DEP_SHA1_kleidiai-qmx}
EXCLUDE_FROM_ALL)
onnxruntime_fetchcontent_makeavailable(kleidiai-qmx)
endif()
endif()

set(onnxruntime_LINK_DIRS)
Expand Down
13 changes: 13 additions & 0 deletions cmake/onnxruntime_mlas.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,11 @@ function(setup_kleidiai)
)
target_link_libraries(onnxruntime_mlas PRIVATE kleidiai)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES kleidiai)
if(onnxruntime_USE_QMX_KLEIDIAI_COEXIST)
target_link_libraries(onnxruntime_mlas PRIVATE kleidiai-qmx)
target_compile_definitions(onnxruntime_mlas PRIVATE ENABLE_QMX_KERNELS=1)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES kleidiai-qmx)
endif()
set(onnxruntime_EXTERNAL_LIBRARIES ${onnxruntime_EXTERNAL_LIBRARIES} PARENT_SCOPE)

# If KLEIDIAI_DEBUG is enabled that implies both DEBUG and KERNEL messages.
Expand All @@ -302,6 +307,14 @@ function(setup_kleidiai)
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()

if(onnxruntime_USE_QMX_KLEIDIAI_COEXIST)
install(TARGETS kleidiai-qmx EXPORT ${PROJECT_NAME}Targets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()
endfunction()

function (setup_arm_neon_nchwc)
Expand Down
11 changes: 0 additions & 11 deletions onnxruntime/core/mlas/inc/mlas.h
Original file line number Diff line number Diff line change
Expand Up @@ -2115,14 +2115,3 @@ MlasFlashAttention(
MlasFlashAttentionThreadedArgs* args,
MLAS_THREADPOOL* ThreadPool
);

#if defined(USE_KLEIDIAI)
/**
* @brief Function to override the packing mechanism decision if kleidi ai is included
* @param enable enable kleidiai packing (allow or disallow depending on true/false)
* @return
*/
void
MLASCALL
MlasGemmBatchPackUseKleidi(bool enable);
#endif
28 changes: 28 additions & 0 deletions onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
#include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h"
#include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa.h"
#include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa.h"
#if defined(ENABLE_QMX_KERNELS)
#include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa.h"
#endif // ENABLE_QMX_KERNELS

const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod =
{kai_get_m_step_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod,
Expand Down Expand Up @@ -122,6 +125,21 @@ const kai_matmul_clamp_f32_f32p_f32p_ukernel sgemm_gemm_sme2 =
kai_get_dst_size_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa,
kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa};

#if defined(ENABLE_QMX_KERNELS)
// SGEMM ukernel descriptor for Qualcomm's QMX variant of the f32 packed
// MOPA matmul kernel. Selected at runtime by GetKleidiAISGemmUKernel() when
// the CPU vendor reports "Qualcomm" and SME2 is not available.
// NOTE(review): this is an aggregate initialization of function pointers —
// the entry order must match the field order of
// kai_matmul_clamp_f32_f32p_f32p_ukernel exactly; do not reorder.
const kai_matmul_clamp_f32_f32p_f32p_ukernel sgemm_gemm_qmx =
{kai_get_m_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_mr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_nr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_kr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_sr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_lhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_rhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_dst_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_dst_size_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa};
#endif // ENABLE_QMX_KERNELS

const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& GetKleidiAIGemmUKernel() {
if (MLAS_CPUIDINFO::GetCPUIDInfo().HasArmNeon_I8MM()) {
return kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm;
Expand All @@ -142,7 +160,17 @@ const kai_matmul_clamp_f32_f32p_f32p_ukernel& GetKleidiAISGemmUKernel() {
if (MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME2()) {
return sgemm_gemm_sme2;
} else {
#if defined(ENABLE_QMX_KERNELS)
if (ArmKleidiAI::vendor_name.compare("Qualcomm") == 0)
{
KLEIDIAI_KERNEL_LOG("SGEMM: Using QMX Kernel");
return sgemm_gemm_qmx;
} else {
return sgemm_gemm_sme;
}
#else
return sgemm_gemm_sme;
#endif // ENABLE_QMX_KERNELS
}
}

Expand Down
32 changes: 27 additions & 5 deletions onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
#include "kai/ukernels/matmul/imatmul_clamp_f32_f32p_f32p/kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme2_mopa.h"
#include "kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x32p2vlx1_x32p_sme.h"
#include "kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme.h"
#if defined(ENABLE_QMX_KERNELS)
#include "kai/ukernels/matmul/imatmul_clamp_f32_f32p_f32p/kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_qmx_mopa.h"
#endif // ENABLE_QMX_KERNELS


// Right-hand-side (weights) cache key
struct RhsCacheKey {
Expand Down Expand Up @@ -596,11 +600,29 @@ static void ConvolveSme(const size_t co, //channels out
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
} else {
KLEIDIAI_KERNEL_LOG("kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa" << " M=" << TileSizeM << " N=" << TileSizeN << " k_chunk_count=" << (d_kh * d_kw) << " k_chunk_length=" << ci);
kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(
TileSizeM, TileSizeN, d_kh * d_kw, ci, ATile, BTile, CTile, co * sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
#if defined(ENABLE_QMX_KERNELS)
if (ArmKleidiAI::vendor_name.compare("Qualcomm") == 0)
{
KLEIDIAI_KERNEL_LOG("kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_qmx_mopa" << " M=" << TileSizeM << " N=" << TileSizeN << " k_chunk_count=" << (d_kh * d_kw) << " k_chunk_length=" << ci);
kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_qmx_mopa(
TileSizeM, TileSizeN, d_kh * d_kw, ci, ATile, BTile, CTile, co * sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
}
else {
KLEIDIAI_KERNEL_LOG("kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa" << " M=" << TileSizeM << " N=" << TileSizeN << " k_chunk_count=" << (d_kh * d_kw) << " k_chunk_length=" << ci);
kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(
TileSizeM, TileSizeN, d_kh * d_kw, ci, ATile, BTile, CTile, co * sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
}
#else
KLEIDIAI_KERNEL_LOG("kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa" << " M=" << TileSizeM << " N=" << TileSizeN << " k_chunk_count=" << (d_kh * d_kw) << " k_chunk_length=" << ci);
kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(
TileSizeM, TileSizeN, d_kh * d_kw, ci, ATile, BTile, CTile, co * sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
#endif // ENABLE_QMX_KERNELS
}
});

Expand Down
2 changes: 2 additions & 0 deletions onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ namespace ArmKleidiAI {

// By default we should try for SME2 first before falling back to SME.
inline const bool UseSME2 = MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME2();
inline const bool UseSME = MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME();
inline const std::string_view vendor_name = MLAS_CPUIDINFO::GetCPUIDInfo().GetCPUVendor();

// Buffer packing routines.
//
Expand Down
59 changes: 51 additions & 8 deletions onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@
#include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon.h"

#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.h"
#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa.h"
#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot.h"
#if defined(ENABLE_QMX_KERNELS)
#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa.h"
#endif // ENABLE_QMX_KERNELS

#include "mlasi_kleidiai.h"

Expand Down Expand Up @@ -108,14 +112,53 @@ ArmKleidiAI::MlasDynamicQGemmBatch(
<< " M="<< Shape.M << " K=" << Shape.K << " mr=" << mr << " kr=" << kr << " sr=" << sr << " m_idx_start=0");
kai_run_lhs_quant_pack_qai8dxp_f32(Shape.M, Shape.K, mr, kr, sr, 0, DataParams->A,
Shape.K*sizeof(float), lhs);
if (ArmKleidiAI::UseSME2)
{
KLEIDIAI_KERNEL_LOG("kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa");
kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(
Shape.M, Shape.N, Shape.K, lhs, DataParams->PackedB,
DataParams->C,
Shape.N * sizeof(float),
sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
}
else {

#if defined(ENABLE_QMX_KERNELS)
if (ArmKleidiAI::vendor_name.compare("Qualcomm") == 0)
{
KLEIDIAI_KERNEL_LOG("kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa");
kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa(
Shape.M, Shape.N, Shape.K, lhs, DataParams->PackedB,
DataParams->C,
Shape.N * sizeof(float),
sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);

}
else {
KLEIDIAI_KERNEL_LOG("kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa");
kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(
Shape.M, Shape.N, Shape.K, lhs, DataParams->PackedB,
DataParams->C,
Shape.N * sizeof(float),
sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
}
#else
KLEIDIAI_KERNEL_LOG("kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa");
kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(
Shape.M, Shape.N, Shape.K, lhs, DataParams->PackedB,
DataParams->C,
Shape.N * sizeof(float),
sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
#endif // ENABLE_QMX_KERNELS

KLEIDIAI_KERNEL_LOG("kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa");
kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(
Shape.M, Shape.N, Shape.K, lhs, DataParams->PackedB,
DataParams->C,
Shape.N * sizeof(float),
sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
}
}
}
13 changes: 8 additions & 5 deletions onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
#include "mlasi_kleidiai.h"
#include "kai_ukernel_interface.h"

#if defined(ENABLE_QMX_KERNELS)
#include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa.h"
#endif // ENABLE_QMX_KERNELS

// Thread-local reusable buffers to reduce allocation overhead across tiles.
struct KaiTlsBuffers {
Expand Down Expand Up @@ -145,9 +148,9 @@ ArmKleidiAI::MlasGemvBatch(
if (M != 1 && N != 1) {
return false;
}

const bool m_path = (M == 1);

// We cannot support cases where N == 1 and B is already packed.
// When both are 1, we route through the M-path, so this naturally doesn't trigger.
if (!m_path && Data->BIsPacked) {
Expand All @@ -165,15 +168,15 @@ ArmKleidiAI::MlasGemvBatch(
// - M-path: LHS is A, stride = lda
// - N-path: LHS is B, stride = ldb
size_t lhs_ld = m_path ? Data[b].lda : Data[b].ldb;

const float* rhs_base = m_path ? static_cast<const float*>(Data[b].B)
: static_cast<const float*>(Data[b].A);
const float* lhs_base = m_path ? static_cast<const float*>(Data[b].A)
const float* lhs_base = m_path ? static_cast<const float*>(Data[b].A)
: static_cast<const float*>(Data[b].B);

// Prepare packed RHS if needed
const void* rhs_packed_ptr = nullptr;

// The if branch can only be taken in cases where we are dealing with M == 1
// We previously reject any prepacked B where N == 1
// In cases where N == 1 we Pack A Matrix as the RHS using tb = CBlasTrans
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/core/mlas/lib/qgemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,7 +206,7 @@ MLASCALL
MlasIsDynamicQGemmAvailable()
{
#if defined(USE_KLEIDIAI)
return ArmKleidiAI::UseSME2;
return (ArmKleidiAI::UseSME2 || ArmKleidiAI::UseSME);
#else
return false;
#endif
Expand Down
2 changes: 2 additions & 0 deletions tools/ci_build/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -896,6 +896,8 @@ def generate_build_tree(

if not args.no_kleidiai:
cmake_args += ["-Donnxruntime_USE_KLEIDIAI=ON"]
if args.use_qmx:
cmake_args += ["-Donnxruntime_USE_QMX_KLEIDIAI_COEXIST=ON"]

if args.enable_arm_neon_nchwc:
cmake_args += ["-Donnxruntime_USE_ARM_NEON_NCHWC=ON"]
Expand Down
6 changes: 6 additions & 0 deletions tools/ci_build/build_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -763,6 +763,12 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None:
"--no_kleidiai", action="store_true", help="Disable KleidiAI integration (used with ACL/ArmNN)."
)

# --- Qualcomm QMX Library ---
qmx_group = parser.add_argument_group("QMX kernel library")
qmx_group.add_argument(
"--use_qmx", action="store_true", help="Enable Qualcomm QMX kernel to coexist with Arm KleidiAI."
)

# --- RKNPU ---
rknpu_group = parser.add_argument_group("RKNPU Execution Provider")
rknpu_group.add_argument("--use_rknpu", action="store_true", help="Enable RKNPU EP.")
Expand Down
Loading