1 change: 1 addition & 0 deletions cmake/CMakeLists.txt
@@ -91,6 +91,7 @@ option(onnxruntime_USE_SVE "Build with SVE support in MLAS" OFF)
option(onnxruntime_USE_ARM_NEON_NCHWC "Build with ARM Neon NCHWc kernels in MLAS" OFF)

option(onnxruntime_USE_KLEIDIAI "Build with KleidiAI integration in MLAS" OFF)
option(onnxruntime_USE_QMX_KLEIDIAI_COEXIST "Build with Qualcomm QMX kernels alongside Arm KleidiAI" OFF)
option(onnxruntime_BUILD_UNIT_TESTS "Build ONNXRuntime unit tests" ON)
option(onnxruntime_BUILD_CSHARP "Build C# library" OFF)
option(onnxruntime_BUILD_OBJC "Build Objective-C library" OFF)
3 changes: 3 additions & 0 deletions cmake/deps.txt
@@ -57,4 +57,7 @@ directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v
cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.12.0.zip;7e733cfdc410d777b76122d64232499205589a96
dawn;https://github.com/google/dawn/archive/13c1635a14574ebb7116b56a69f5519301417fda.zip;0aadd28fc385cf7d657d5fc70a352372d2d3c76a
kleidiai;https://github.com/ARM-software/kleidiai/archive/refs/tags/v1.20.0.tar.gz;6895e72b3d5cf1173358164cb3d64c9d7d33cc84
# kleidiai-qmx is pinned to a specific commit as there are no tagged releases. When an appropriate tagged release becomes available,
# this entry will be updated to use refs/tags/<version> instead of the raw commit hash.
kleidiai-qmx;https://github.com/qualcomm/kleidiai/archive/2f10c9a8d32f81ffeeb6d4885a29cc35d2b0da87.zip;5e855730a2d69057a569f43dd7532db3b2d2a05c
duktape;https://github.com/svaarala/duktape/releases/download/v2.7.0/duktape-2.7.0.tar.xz;8200c8e417dbab7adcc12c4dbdef7651cfc55794
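
Note: each deps.txt entry is a semicolon-separated triple of name, archive URL, and checksum, which the external-deps CMake below consumes via URL_HASH SHA1=... . The following is a minimal, illustrative Python sketch (not part of this PR) for checking the kleidiai-qmx pin locally; it assumes the third field is the SHA-1 of the downloaded archive, which is what the URL_HASH usage suggests.

# Illustrative pin check for the kleidiai-qmx entry above; not part of this change.
import hashlib
import urllib.request

URL = "https://github.com/qualcomm/kleidiai/archive/2f10c9a8d32f81ffeeb6d4885a29cc35d2b0da87.zip"
EXPECTED_SHA1 = "5e855730a2d69057a569f43dd7532db3b2d2a05c"

def pin_matches() -> bool:
    # Download the pinned archive and compare its SHA-1 against the deps.txt value.
    with urllib.request.urlopen(URL) as resp:
        return hashlib.sha1(resp.read()).hexdigest() == EXPECTED_SHA1

if __name__ == "__main__":
    print("kleidiai-qmx pin matches" if pin_matches() else "kleidiai-qmx pin MISMATCH")
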
6 changes: 6 additions & 0 deletions cmake/external/onnxruntime_external_deps.cmake
@@ -845,6 +845,12 @@ if(onnxruntime_USE_KLEIDIAI)

onnxruntime_fetchcontent_declare(kleidiai URL ${DEP_URL_kleidiai} URL_HASH SHA1=${DEP_SHA1_kleidiai} EXCLUDE_FROM_ALL)
onnxruntime_fetchcontent_makeavailable(kleidiai)
# Fetch Qualcomm's kleidiai library
if(onnxruntime_USE_QMX_KLEIDIAI_COEXIST)
onnxruntime_fetchcontent_declare(kleidiai-qmx URL ${DEP_URL_kleidiai-qmx} URL_HASH SHA1=${DEP_SHA1_kleidiai-qmx}
EXCLUDE_FROM_ALL)
onnxruntime_fetchcontent_makeavailable(kleidiai-qmx)
endif()
endif()

set(onnxruntime_LINK_DIRS)
13 changes: 13 additions & 0 deletions cmake/onnxruntime_mlas.cmake
@@ -289,6 +289,11 @@ function(setup_kleidiai)
)
target_link_libraries(onnxruntime_mlas PRIVATE kleidiai)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES kleidiai)
if(onnxruntime_USE_QMX_KLEIDIAI_COEXIST)
target_link_libraries(onnxruntime_mlas PRIVATE kleidiai-qmx)
target_compile_definitions(onnxruntime_mlas PRIVATE ENABLE_QMX_KERNELS=1)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES kleidiai-qmx)
endif()
set(onnxruntime_EXTERNAL_LIBRARIES ${onnxruntime_EXTERNAL_LIBRARIES} PARENT_SCOPE)

# If KLEIDIAI_DEBUG is enabled that implies both DEBUG and KERNEL messages.
@@ -307,6 +312,14 @@ function(setup_kleidiai)
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()

if(onnxruntime_USE_QMX_KLEIDIAI_COEXIST)
install(TARGETS kleidiai-qmx EXPORT ${PROJECT_NAME}Targets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()
endfunction()

function (setup_arm_neon_nchwc)
11 changes: 0 additions & 11 deletions onnxruntime/core/mlas/inc/mlas.h
@@ -2125,14 +2125,3 @@ MlasFlashAttention(
MlasFlashAttentionThreadedArgs* args,
MLAS_THREADPOOL* ThreadPool
);

#if defined(USE_KLEIDIAI)
/**
* @brief Function to override the packing mechanism decision if kleidi ai is included
* @param enable enable kleidiai packing (allow or disallow depending on true/false)
* @return
*/
void
MLASCALL
MlasGemmBatchPackUseKleidi(bool enable);
#endif
28 changes: 28 additions & 0 deletions onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp
@@ -19,6 +19,9 @@
#include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h"
#include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa.h"
#include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa.h"
#if defined(ENABLE_QMX_KERNELS)
#include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa.h"
#endif // ENABLE_QMX_KERNELS

const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod =
{kai_get_m_step_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod,
@@ -122,6 +125,21 @@ const kai_matmul_clamp_f32_f32p_f32p_ukernel sgemm_gemm_sme2 =
kai_get_dst_size_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa,
kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa};

#if defined(ENABLE_QMX_KERNELS)
const kai_matmul_clamp_f32_f32p_f32p_ukernel sgemm_gemm_qmx =
{kai_get_m_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_mr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_nr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_kr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_sr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_lhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_rhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_dst_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_dst_size_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa};
#endif // ENABLE_QMX_KERNELS

const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& GetKleidiAIGemmUKernel() {
if (MLAS_CPUIDINFO::GetCPUIDInfo().HasArmNeon_I8MM()) {
return kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm;
@@ -142,7 +160,17 @@ const kai_matmul_clamp_f32_f32p_f32p_ukernel& GetKleidiAISGemmUKernel() {
if (MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME2()) {
return sgemm_gemm_sme2;
} else {
#if defined(ENABLE_QMX_KERNELS)
if (ArmKleidiAI::vendor_name.compare("Qualcomm") == 0) {
KLEIDIAI_KERNEL_LOG("SGEMM: Using QMX Kernel");
return sgemm_gemm_qmx;
} else {
return sgemm_gemm_sme;
}
#else
return sgemm_gemm_sme;
#endif // ENABLE_QMX_KERNELS
}
}

32 changes: 27 additions & 5 deletions onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp
@@ -16,6 +16,10 @@
#include "kai/ukernels/matmul/imatmul_clamp_f32_f32p_f32p/kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme2_mopa.h"
#include "kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x32p2vlx1_x32p_sme.h"
#include "kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme.h"
#if defined(ENABLE_QMX_KERNELS)
#include "kai/ukernels/matmul/imatmul_clamp_f32_f32p_f32p/kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_qmx_mopa.h"
#endif // ENABLE_QMX_KERNELS


// Right-hand-side (weights) cache key
struct RhsCacheKey {
@@ -596,11 +600,29 @@ static void ConvolveSme(const size_t co, //channels out
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
} else {
KLEIDIAI_KERNEL_LOG("kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa" << " M=" << TileSizeM << " N=" << TileSizeN << " k_chunk_count=" << (d_kh * d_kw) << " k_chunk_length=" << ci);
kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(
TileSizeM, TileSizeN, d_kh * d_kw, ci, ATile, BTile, CTile, co * sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
#if defined(ENABLE_QMX_KERNELS)
if (ArmKleidiAI::vendor_name.compare("Qualcomm") == 0) {
KLEIDIAI_KERNEL_LOG("kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_qmx_mopa" << " M=" << TileSizeM << " N=" << TileSizeN << " k_chunk_count=" << (d_kh * d_kw) << " k_chunk_length=" << ci);
kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_qmx_mopa(
TileSizeM, TileSizeN, d_kh * d_kw, ci, ATile, BTile, CTile, co * sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
} else {
KLEIDIAI_KERNEL_LOG("kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa" << " M=" << TileSizeM << " N=" << TileSizeN << " k_chunk_count=" << (d_kh * d_kw) << " k_chunk_length=" << ci);
kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(
TileSizeM, TileSizeN, d_kh * d_kw, ci, ATile, BTile, CTile, co * sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
}
#else
KLEIDIAI_KERNEL_LOG("kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa" << " M=" << TileSizeM << " N=" << TileSizeN << " k_chunk_count=" << (d_kh * d_kw) << " k_chunk_length=" << ci);
kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(
TileSizeM, TileSizeN, d_kh * d_kw, ci, ATile, BTile, CTile, co * sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
#endif // ENABLE_QMX_KERNELS
}
});

1 change: 1 addition & 0 deletions onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h
@@ -54,6 +54,7 @@ namespace ArmKleidiAI {
// By default we should try for SME2 first before falling back to SME.
inline const bool UseSME2 = MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME2();
inline const bool UseSME = MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME();
inline const std::string_view vendor_name = MLAS_CPUIDINFO::GetCPUIDInfo().GetCPUVendor();

// Buffer packing routines.
//
40 changes: 33 additions & 7 deletions onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp
@@ -13,6 +13,9 @@
#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.h"
#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa.h"
#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot.h"
#if defined(ENABLE_QMX_KERNELS)
#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa.h"
#endif // ENABLE_QMX_KERNELS

#include "mlasi_kleidiai.h"

@@ -247,13 +250,36 @@ ArmKleidiAI::MlasDynamicQGemmBatch(
);
}
else {
kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(
TileSizeM, TileSizeN, Shape.K, ATile, BTile,
dst_tile,
DataParams[BIdx].ldc * sizeof(float),
sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
#if defined(ENABLE_QMX_KERNELS)
if (ArmKleidiAI::vendor_name.compare("Qualcomm") == 0) {
kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa(
TileSizeM, TileSizeN, Shape.K, ATile, BTile,
dst_tile,
DataParams[BIdx].ldc * sizeof(float),
sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
} else {
kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(
TileSizeM, TileSizeN, Shape.K, ATile, BTile,
dst_tile,
DataParams[BIdx].ldc * sizeof(float),
sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
}
#else
kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(
TileSizeM, TileSizeN, Shape.K, ATile, BTile,
dst_tile,
DataParams[BIdx].ldc * sizeof(float),
sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
#endif // ENABLE_QMX_KERNELS
}
});
}
13 changes: 8 additions & 5 deletions onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp
@@ -16,6 +16,9 @@
#include "mlasi_kleidiai.h"
#include "kai_ukernel_interface.h"

#if defined(ENABLE_QMX_KERNELS)
#include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa.h"
#endif // ENABLE_QMX_KERNELS

// Thread-local reusable buffers to reduce allocation overhead across tiles.
struct KaiTlsBuffers {
@@ -145,9 +148,9 @@ ArmKleidiAI::MlasGemvBatch(
if (M != 1 && N != 1) {
return false;
}

const bool m_path = (M == 1);

// We cannot support cases where N == 1 and B is already packed.
// When both are 1, we route through the M-path, so this naturally doesn't trigger.
if (!m_path && Data->BIsPacked) {
@@ -165,15 +168,15 @@
// - M-path: LHS is A, stride = lda
// - N-path: LHS is B, stride = ldb
size_t lhs_ld = m_path ? Data[b].lda : Data[b].ldb;

const float* rhs_base = m_path ? static_cast<const float*>(Data[b].B)
: static_cast<const float*>(Data[b].A);
const float* lhs_base = m_path ? static_cast<const float*>(Data[b].A)
: static_cast<const float*>(Data[b].B);

// Prepare packed RHS if needed
const void* rhs_packed_ptr = nullptr;

// This branch can only be taken when M == 1.
// Any prepacked B with N == 1 was already rejected above.
// In cases where N == 1 we pack the A matrix as the RHS using tb = CBlasTrans.
2 changes: 1 addition & 1 deletion onnxruntime/core/mlas/lib/qgemm.cpp
@@ -206,7 +206,7 @@ MLASCALL
MlasIsDynamicQGemmAvailable()
{
#if defined(USE_KLEIDIAI)
return (ArmKleidiAI::UseSME || ArmKleidiAI::UseSME2);
return (ArmKleidiAI::UseSME2 || ArmKleidiAI::UseSME);
#else
return false;
#endif
2 changes: 2 additions & 0 deletions tools/ci_build/build.py
@@ -896,6 +896,8 @@ def generate_build_tree(

if not args.no_kleidiai:
cmake_args += ["-Donnxruntime_USE_KLEIDIAI=ON"]
if args.use_qmx:
cmake_args += ["-Donnxruntime_USE_QMX_KLEIDIAI_COEXIST=ON"]

if args.enable_arm_neon_nchwc:
cmake_args += ["-Donnxruntime_USE_ARM_NEON_NCHWC=ON"]
6 changes: 6 additions & 0 deletions tools/ci_build/build_args.py
@@ -763,6 +763,12 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None:
"--no_kleidiai", action="store_true", help="Disable KleidiAI integration (used with ACL/ArmNN)."
)

# --- Qualcomm QMX Library ---
qmx_group = parser.add_argument_group("QMX kernel library")
qmx_group.add_argument(
"--use_qmx", action="store_true", help="Enable Qualcomm QMX kernel to coexist with Arm KleidiAI."
)

# --- RKNPU ---
rknpu_group = parser.add_argument_group("RKNPU Execution Provider")
rknpu_group.add_argument("--use_rknpu", action="store_true", help="Enable RKNPU EP.")
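
For context, an illustrative sketch (not part of this PR) of how the new --use_qmx flag is expected to reach CMake, mirroring the generate_build_tree() change above. The build.py command line in the comment is an assumed typical invocation; adjust flags and paths for your platform.

# Illustrative only: --use_qmx maps onto onnxruntime_USE_QMX_KLEIDIAI_COEXIST,
# alongside the default KleidiAI switch. A typical invocation might look like:
#   python tools/ci_build/build.py --config Release --build_dir build --use_qmx
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--no_kleidiai", action="store_true")
parser.add_argument("--use_qmx", action="store_true")
args = parser.parse_args(["--use_qmx"])

cmake_args = []
if not args.no_kleidiai:
    cmake_args += ["-Donnxruntime_USE_KLEIDIAI=ON"]
    if args.use_qmx:
        cmake_args += ["-Donnxruntime_USE_QMX_KLEIDIAI_COEXIST=ON"]

print(cmake_args)
# ['-Donnxruntime_USE_KLEIDIAI=ON', '-Donnxruntime_USE_QMX_KLEIDIAI_COEXIST=ON']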