diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index cd939acc5aeae..6d0d39556e1c0 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -91,6 +91,7 @@ option(onnxruntime_USE_SVE "Build with SVE support in MLAS" OFF)
 option(onnxruntime_USE_ARM_NEON_NCHWC "Build with ARM Neon NCHWc kernels in MLAS" OFF)
 
 option(onnxruntime_USE_KLEIDIAI "Build with KleidiAI integration in MLAS" OFF)
+option(onnxruntime_USE_QMX_KLEIDIAI_COEXIST "Build with QMX and Arm KLEIDIAI libraries" OFF)
 option(onnxruntime_BUILD_UNIT_TESTS "Build ONNXRuntime unit tests" ON)
 option(onnxruntime_BUILD_CSHARP "Build C# library" OFF)
 option(onnxruntime_BUILD_OBJC "Build Objective-C library" OFF)
diff --git a/cmake/deps.txt b/cmake/deps.txt
index 38c80f87095c8..578dd8fd23d09 100644
--- a/cmake/deps.txt
+++ b/cmake/deps.txt
@@ -57,4 +57,7 @@ directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v
 cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.12.0.zip;7e733cfdc410d777b76122d64232499205589a96
 dawn;https://github.com/google/dawn/archive/13c1635a14574ebb7116b56a69f5519301417fda.zip;0aadd28fc385cf7d657d5fc70a352372d2d3c76a
 kleidiai;https://github.com/ARM-software/kleidiai/archive/refs/tags/v1.20.0.tar.gz;6895e72b3d5cf1173358164cb3d64c9d7d33cc84
+# kleidiai-qmx is pinned to a specific commit as there are no tagged releases. When an appropriate tagged release becomes available,
+# this entry will be updated to use refs/tags/ instead of the raw commit hash.
+kleidiai-qmx;https://github.com/qualcomm/kleidiai/archive/2f10c9a8d32f81ffeeb6d4885a29cc35d2b0da87.zip;5e855730a2d69057a569f43dd7532db3b2d2a05c
 duktape;https://github.com/svaarala/duktape/releases/download/v2.7.0/duktape-2.7.0.tar.xz;8200c8e417dbab7adcc12c4dbdef7651cfc55794
diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index 3c616684fb296..9feb7772d1e88 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -845,6 +845,12 @@ if(onnxruntime_USE_KLEIDIAI)
 
   onnxruntime_fetchcontent_declare(kleidiai URL ${DEP_URL_kleidiai} URL_HASH SHA1=${DEP_SHA1_kleidiai} EXCLUDE_FROM_ALL)
   onnxruntime_fetchcontent_makeavailable(kleidiai)
+  # Fetch Qualcomm's kleidiai library
+  if(onnxruntime_USE_QMX_KLEIDIAI_COEXIST)
+    onnxruntime_fetchcontent_declare(kleidiai-qmx URL ${DEP_URL_kleidiai-qmx} URL_HASH SHA1=${DEP_SHA1_kleidiai-qmx}
+                                     EXCLUDE_FROM_ALL)
+    onnxruntime_fetchcontent_makeavailable(kleidiai-qmx)
+  endif()
 endif()
 
 set(onnxruntime_LINK_DIRS)
diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake
index 9cbeb161f4c7e..7653948be1609 100644
--- a/cmake/onnxruntime_mlas.cmake
+++ b/cmake/onnxruntime_mlas.cmake
@@ -289,6 +289,11 @@ function(setup_kleidiai)
   )
   target_link_libraries(onnxruntime_mlas PRIVATE kleidiai)
   list(APPEND onnxruntime_EXTERNAL_LIBRARIES kleidiai)
+  if(onnxruntime_USE_QMX_KLEIDIAI_COEXIST)
+    target_link_libraries(onnxruntime_mlas PRIVATE kleidiai-qmx)
+    target_compile_definitions(onnxruntime_mlas PRIVATE ENABLE_QMX_KERNELS=1)
+    list(APPEND onnxruntime_EXTERNAL_LIBRARIES kleidiai-qmx)
+  endif()
   set(onnxruntime_EXTERNAL_LIBRARIES ${onnxruntime_EXTERNAL_LIBRARIES} PARENT_SCOPE)
 
   # If KLEIDIAI_DEBUG is enabled that implies both DEBUG and KERNEL messages.
@@ -307,6 +312,14 @@ function(setup_kleidiai)
             RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
             FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
   endif()
+
+  if(onnxruntime_USE_QMX_KLEIDIAI_COEXIST)
+    install(TARGETS kleidiai-qmx EXPORT ${PROJECT_NAME}Targets
+            ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
+            LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
+            RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
+            FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
+  endif()
 endfunction()
 
 function (setup_arm_neon_nchwc)
diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h
index be9c997a93ba9..0b6eda1ea95cb 100644
--- a/onnxruntime/core/mlas/inc/mlas.h
+++ b/onnxruntime/core/mlas/inc/mlas.h
@@ -2125,14 +2125,3 @@ MlasFlashAttention(
     MlasFlashAttentionThreadedArgs* args,
     MLAS_THREADPOOL* ThreadPool
 );
-
-#if defined(USE_KLEIDIAI)
-/**
- * @brief Function to override the packing mechanism decision if kleidi ai is included
- * @param enable enable kleidiai packing (allow or disallow depending on true/false)
- * @return
-*/
-void
-MLASCALL
-MlasGemmBatchPackUseKleidi(bool enable);
-#endif
diff --git a/onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp b/onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp
index 87184bf8bb3cf..a406f371a3bd2 100644
--- a/onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp
+++ b/onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp
@@ -19,6 +19,9 @@
 #include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h"
 #include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa.h"
 #include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa.h"
+#if defined(ENABLE_QMX_KERNELS)
+#include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa.h"
+#endif // ENABLE_QMX_KERNELS
 
 const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod =
     {kai_get_m_step_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod,
@@ -122,6 +125,21 @@ const kai_matmul_clamp_f32_f32p_f32p_ukernel sgemm_gemm_sme2 =
      kai_get_dst_size_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa,
      kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa};
 
+#if defined(ENABLE_QMX_KERNELS)
+const kai_matmul_clamp_f32_f32p_f32p_ukernel sgemm_gemm_qmx =
+    {kai_get_m_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
+     kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
+     kai_get_mr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
+     kai_get_nr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
+     kai_get_kr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
+     kai_get_sr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
+     kai_get_lhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
+     kai_get_rhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
+     kai_get_dst_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
+     kai_get_dst_size_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
+     kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa};
+#endif // ENABLE_QMX_KERNELS
+
 const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& GetKleidiAIGemmUKernel() {
     if (MLAS_CPUIDINFO::GetCPUIDInfo().HasArmNeon_I8MM()) {
         return kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm;
@@ -142,7 +160,17 @@ const kai_matmul_clamp_f32_f32p_f32p_ukernel& GetKleidiAISGemmUKernel() {
     if (MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME2()) {
         return sgemm_gemm_sme2;
     } else {
+#if defined(ENABLE_QMX_KERNELS)
+        if (ArmKleidiAI::vendor_name.compare("Qualcomm") == 0)
+        {
+            KLEIDIAI_KERNEL_LOG("SGEMM: Using QMX Kernel");
+            return sgemm_gemm_qmx;
+        } else {
+            return sgemm_gemm_sme;
+        }
+#else
         return sgemm_gemm_sme;
+#endif // ENABLE_QMX_KERNELS
     }
 }
 
diff --git a/onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp
index 487e1533f5967..94332c9ed34bc 100644
--- a/onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp
+++ b/onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp
@@ -16,6 +16,10 @@
 #include "kai/ukernels/matmul/imatmul_clamp_f32_f32p_f32p/kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme2_mopa.h"
 #include "kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x32p2vlx1_x32p_sme.h"
 #include "kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme.h"
+#if defined(ENABLE_QMX_KERNELS)
+#include "kai/ukernels/matmul/imatmul_clamp_f32_f32p_f32p/kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_qmx_mopa.h"
+#endif // ENABLE_QMX_KERNELS
+
 
 // Right-hand-side (weights) cache key
 struct RhsCacheKey {
@@ -596,11 +600,29 @@ static void ConvolveSme(const size_t co, //channels out
                 -std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
             );
         } else {
-            KLEIDIAI_KERNEL_LOG("kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa" << " M=" << TileSizeM << " N=" << TileSizeN << " k_chunk_count=" << (d_kh * d_kw) << " k_chunk_length=" << ci);
-            kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(
-                TileSizeM, TileSizeN, d_kh * d_kw, ci, ATile, BTile, CTile, co * sizeof(float),
-                -std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
-            );
+            #if defined(ENABLE_QMX_KERNELS)
+            if (ArmKleidiAI::vendor_name.compare("Qualcomm") == 0)
+            {
+                KLEIDIAI_KERNEL_LOG("kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_qmx_mopa" << " M=" << TileSizeM << " N=" << TileSizeN << " k_chunk_count=" << (d_kh * d_kw) << " k_chunk_length=" << ci);
+                kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_qmx_mopa(
+                    TileSizeM, TileSizeN, d_kh * d_kw, ci, ATile, BTile, CTile, co * sizeof(float),
+                    -std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
+                );
+            }
+            else {
+                KLEIDIAI_KERNEL_LOG("kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa" << " M=" << TileSizeM << " N=" << TileSizeN << " k_chunk_count=" << (d_kh * d_kw) << " k_chunk_length=" << ci);
+                kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(
+                    TileSizeM, TileSizeN, d_kh * d_kw, ci, ATile, BTile, CTile, co * sizeof(float),
+                    -std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
+                );
+            }
+            #else
+            KLEIDIAI_KERNEL_LOG("kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa" << " M=" << TileSizeM << " N=" << TileSizeN << " k_chunk_count=" << (d_kh * d_kw) << " k_chunk_length=" << ci);
+            kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(
+                TileSizeM, TileSizeN, d_kh * d_kw, ci, ATile, BTile, CTile, co * sizeof(float),
+                -std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
+            );
+            #endif // ENABLE_QMX_KERNELS
         }
     });
 
diff --git a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h
index d652989a610e0..d4df09bb94a93 100644
--- a/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h
+++ b/onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h
@@ -54,6 +54,7 @@ namespace ArmKleidiAI {
 // By default we should try for SME2 first before falling back to SME.
 inline const bool UseSME2 = MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME2();
 inline const bool UseSME = MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME();
+inline const std::string_view vendor_name = MLAS_CPUIDINFO::GetCPUIDInfo().GetCPUVendor();
 
 // Buffer packing routines.
 //
diff --git a/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp
index 2a63dc80316f5..9b2deac69ff63 100644
--- a/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp
+++ b/onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp
@@ -13,6 +13,9 @@
 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.h"
 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa.h"
 #include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot.h"
+#if defined(ENABLE_QMX_KERNELS)
+#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa.h"
+#endif // ENABLE_QMX_KERNELS
 
 #include "mlasi_kleidiai.h"
 
@@ -247,13 +250,36 @@ ArmKleidiAI::MlasDynamicQGemmBatch(
                 );
             }
             else {
-                kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(
-                    TileSizeM, TileSizeN, Shape.K, ATile, BTile,
-                    dst_tile,
-                    DataParams[BIdx].ldc * sizeof(float),
-                    sizeof(float),
-                    -std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
-                );
+                #if defined(ENABLE_QMX_KERNELS)
+                if(ArmKleidiAI::vendor_name.compare("Qualcomm") == 0)
+                {
+                    kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa(
+                        TileSizeM, TileSizeN, Shape.K, ATile, BTile,
+                        dst_tile,
+                        DataParams[BIdx].ldc * sizeof(float),
+                        sizeof(float),
+                        -std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
+                    );
+                }
+                else
+                {
+                    kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(
+                        TileSizeM, TileSizeN, Shape.K, ATile, BTile,
+                        dst_tile,
+                        DataParams[BIdx].ldc * sizeof(float),
+                        sizeof(float),
+                        -std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
+                    );
+                }
+                #else
+                kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(
+                    TileSizeM, TileSizeN, Shape.K, ATile, BTile,
+                    dst_tile,
+                    DataParams[BIdx].ldc * sizeof(float),
+                    sizeof(float),
+                    -std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
+                );
+                #endif // ENABLE_QMX_KERNELS
             }
         });
     }
diff --git a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp
index 250b5d076475d..618d52c7af661 100644
--- a/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp
+++ b/onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp
@@ -16,6 +16,9 @@
 
 #include "mlasi_kleidiai.h"
 #include "kai_ukernel_interface.h"
+#if defined(ENABLE_QMX_KERNELS)
+#include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa.h"
+#endif // ENABLE_QMX_KERNELS
 
 // Thread-local reusable buffers to reduce allocation overhead across tiles.
 struct KaiTlsBuffers {
@@ -145,9 +148,9 @@ ArmKleidiAI::MlasGemvBatch(
     if (M != 1 && N != 1) {
         return false;
     }
-    
+
     const bool m_path = (M == 1);
-    
+
     // We cannot support cases where N == 1 and B is already packed.
     // When both are 1, we route through the M-path, so this naturally doesn't trigger.
     if (!m_path && Data->BIsPacked) {
@@ -165,15 +168,15 @@ ArmKleidiAI::MlasGemvBatch(
         // - M-path: LHS is A, stride = lda
         // - N-path: LHS is B, stride = ldb
         size_t lhs_ld = m_path ? Data[b].lda : Data[b].ldb;
-        
+
         const float* rhs_base = m_path ? static_cast<const float*>(Data[b].B)
                                        : static_cast<const float*>(Data[b].A);
-        const float* lhs_base = m_path ? static_cast<const float*>(Data[b].A) 
+        const float* lhs_base = m_path ? static_cast<const float*>(Data[b].A)
                                        : static_cast<const float*>(Data[b].B);
 
         // Prepare packed RHS if needed
         const void* rhs_packed_ptr = nullptr;
-        
+
         // The if branch can only be taken in cases where we are dealing with M == 1
         // We previously reject any prepacked B where N == 1
         // In cases where N == 1 we Pack A Matrix as the RHS using tb = CBlasTrans
diff --git a/onnxruntime/core/mlas/lib/qgemm.cpp b/onnxruntime/core/mlas/lib/qgemm.cpp
index 1678a7e6e4486..3fc69a607de3f 100644
--- a/onnxruntime/core/mlas/lib/qgemm.cpp
+++ b/onnxruntime/core/mlas/lib/qgemm.cpp
@@ -206,7 +206,7 @@ MLASCALL
 MlasIsDynamicQGemmAvailable()
 {
 #if defined(USE_KLEIDIAI)
-    return (ArmKleidiAI::UseSME || ArmKleidiAI::UseSME2);
+    return (ArmKleidiAI::UseSME2 || ArmKleidiAI::UseSME);
 #else
     return false;
 #endif
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index e7e5cbe5ea031..a0712af35e455 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -896,6 +896,8 @@ def generate_build_tree(
 
     if not args.no_kleidiai:
         cmake_args += ["-Donnxruntime_USE_KLEIDIAI=ON"]
+    if args.use_qmx:
+        cmake_args += ["-Donnxruntime_USE_QMX_KLEIDIAI_COEXIST=ON"]
 
     if args.enable_arm_neon_nchwc:
         cmake_args += ["-Donnxruntime_USE_ARM_NEON_NCHWC=ON"]
diff --git a/tools/ci_build/build_args.py b/tools/ci_build/build_args.py
index 33d6c39de1aad..f21a0d6350299 100644
--- a/tools/ci_build/build_args.py
+++ b/tools/ci_build/build_args.py
@@ -763,6 +763,12 @@ def add_execution_provider_args(parser: argparse.ArgumentParser) -> None:
         "--no_kleidiai", action="store_true", help="Disable KleidiAI integration (used with ACL/ArmNN)."
     )
 
+    # --- Qualcomm QMX Library ---
+    qmx_group = parser.add_argument_group("QMX kernel library")
+    qmx_group.add_argument(
+        "--use_qmx", action="store_true", help="Enable Qualcomm QMX kernel to coexist with Arm KleidiAI."
+    )
+
     # --- RKNPU ---
     rknpu_group = parser.add_argument_group("RKNPU Execution Provider")
     rknpu_group.add_argument("--use_rknpu", action="store_true", help="Enable RKNPU EP.")