Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmake/deps.txt
Original file line number Diff line number Diff line change
Expand Up @@ -57,4 +57,5 @@ directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v
cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.12.0.zip;7e733cfdc410d777b76122d64232499205589a96
dawn;https://github.com/google/dawn/archive/13c1635a14574ebb7116b56a69f5519301417fda.zip;0aadd28fc385cf7d657d5fc70a352372d2d3c76a
kleidiai;https://github.com/ARM-software/kleidiai/archive/refs/tags/v1.15.0.tar.gz;62ccd24ab60bcef68766440fb42d79071ac2a5d2
kleidiai-qmx;https://github.com/qualcomm/kleidiai/archive/85927d96456a18b9fd3b7d0f17bc2f3e0163f429.zip;255a54e4ef9d498c9c6987582f69032e031b38cf
duktape;https://github.com/svaarala/duktape/releases/download/v2.7.0/duktape-2.7.0.tar.xz;8200c8e417dbab7adcc12c4dbdef7651cfc55794
6 changes: 6 additions & 0 deletions cmake/external/onnxruntime_external_deps.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -845,6 +845,12 @@ if(onnxruntime_USE_KLEIDIAI)

onnxruntime_fetchcontent_declare(kleidiai URL ${DEP_URL_kleidiai} URL_HASH SHA1=${DEP_SHA1_kleidiai} EXCLUDE_FROM_ALL)
onnxruntime_fetchcontent_makeavailable(kleidiai)
# Fetch Qualcomm's kleidiai library
if(ENABLE_KLEIDIAI_QMX_COEXIST)
onnxruntime_fetchcontent_declare(kleidiai-qmx URL ${DEP_URL_kleidiai-qmx} URL_HASH SHA1=${DEP_SHA1_kleidiai-qmx}
EXCLUDE_FROM_ALL)
onnxruntime_fetchcontent_makeavailable(kleidiai-qmx)
endif()
endif()

set(onnxruntime_LINK_DIRS)
Expand Down
13 changes: 13 additions & 0 deletions cmake/onnxruntime_mlas.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,11 @@ function(setup_kleidiai)
)
target_link_libraries(onnxruntime_mlas PRIVATE kleidiai)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES kleidiai)
if(ENABLE_KLEIDIAI_QMX_COEXIST)
target_link_libraries(onnxruntime_mlas PRIVATE kleidiai-qmx)
target_compile_definitions(onnxruntime_mlas PRIVATE ENABLE_QMX_KERNELS=1)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES kleidiai-qmx)
endif()
set(onnxruntime_EXTERNAL_LIBRARIES ${onnxruntime_EXTERNAL_LIBRARIES} PARENT_SCOPE)

# If KLEIDIAI_DEBUG is enabled that implies both DEBUG and KERNEL messages.
Expand All @@ -302,6 +307,14 @@ function(setup_kleidiai)
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()

if(ENABLE_KLEIDIAI_QMX_COEXIST)
install(TARGETS kleidiai-qmx EXPORT ${PROJECT_NAME}Targets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()
endfunction()

function (setup_arm_neon_nchwc)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase {

Status Compute(OpKernelContext* context) const override;

#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) override {
Expand Down Expand Up @@ -307,7 +307,7 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase {
private:
// Indicates when MlasDynamicQGemmBatch() can be used
bool can_use_dynamic_quant_mlas_{false};
#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
// Indicates that the biases are a constant input and thus already quantized / packed
bool dynamic_quant_mlas_bias_data_was_packed_{false};
#endif
Expand Down Expand Up @@ -382,7 +382,7 @@ Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const {
}
// Guard against KleidiAI functions being called in non kleidi builds
// TODO: migrate to a suitable override function call for kleidi dynamic qgemm function calls
#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
else {
MatMulComputeHelper helper;
ORT_RETURN_IF_ERROR(helper.Compute(ctx->Input<Tensor>(IN_A)->Shape(),
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/core/mlas/inc/mlas.h
Original file line number Diff line number Diff line change
Expand Up @@ -2116,7 +2116,7 @@ MlasFlashAttention(
MLAS_THREADPOOL* ThreadPool
);

#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
/**
* @brief Function to override the packing mechanism decision if kleidi ai is included
* @param enable enable kleidiai packing (allow or disallow depending on true/false)
Expand Down
29 changes: 29 additions & 0 deletions onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
#include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h"
#include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa.h"
#include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa.h"
#if defined(ENABLE_QMX_KERNELS)
#include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa.h"
#endif // ENABLE_QMX_KERNELS

const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod =
{kai_get_m_step_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod,
Expand Down Expand Up @@ -122,6 +125,21 @@ const kai_matmul_clamp_f32_f32p_f32p_ukernel sgemm_gemm_sme2 =
kai_get_dst_size_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa,
kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa};

#if defined(ENABLE_QMX_KERNELS)
const kai_matmul_clamp_f32_f32p_f32p_ukernel sgemm_gemm_qmx =
{kai_get_m_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_mr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_nr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_kr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_sr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_lhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_rhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_dst_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_dst_size_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa};
#endif // ENABLE_QMX_KERNELS

const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& GetKleidiAIGemmUKernel() {
if (MLAS_CPUIDINFO::GetCPUIDInfo().HasArmNeon_I8MM()) {
return kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm;
Expand All @@ -142,7 +160,18 @@ const kai_matmul_clamp_f32_f32p_f32p_ukernel& GetKleidiAISGemmUKernel() {
if (MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME2()) {
return sgemm_gemm_sme2;
} else {
#if defined(ENABLE_QMX_KERNELS)
if (ArmKleidiAI::vendor_name.compare("Qualcomm") == 0)
{
KLEIDIAI_KERNEL_LOG("SGEMM: Using QMX Kernel");
return sgemm_gemm_qmx;

} else {
return sgemm_gemm_sme;
}
#else
return sgemm_gemm_sme;
#endif // ENABLE_QMX_KERNELS
}
}

Expand Down
31 changes: 27 additions & 4 deletions onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
#include "kai/ukernels/matmul/imatmul_clamp_f32_f32p_f32p/kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme2_mopa.h"
#include "kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x32p2vlx1_x32p_sme.h"
#include "kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme.h"
#if defined(ENABLE_QMX_KERNELS)
#include "kai/ukernels/matmul/imatmul_clamp_f32_f32p_f32p/kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_qmx_mopa.h"
#endif // ENABLE_QMX_KERNELS


// Right-hand-side (weights) cache key
struct RhsCacheKey {
Expand Down Expand Up @@ -597,10 +601,29 @@ static void ConvolveSme(const size_t co, //channels out
);
} else {
KLEIDIAI_KERNEL_LOG("kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa" << " M=" << TileSizeM << " N=" << TileSizeN << " k_chunk_count=" << (d_kh * d_kw) << " k_chunk_length=" << ci);
kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(
TileSizeM, TileSizeN, d_kh * d_kw, ci, ATile, BTile, CTile, co * sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
#if defined(ENABLE_QMX_KERNELS)
if (ArmKleidiAI::vendor_name.compare("Qualcomm") == 0)
{
KLEIDIAI_KERNEL_LOG("Convolve: Using QMX Kernel");
kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_qmx_mopa(
TileSizeM, TileSizeN, d_kh * d_kw, ci, ATile, BTile, CTile, co * sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
}
else {
kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(
TileSizeM, TileSizeN, d_kh * d_kw, ci, ATile, BTile, CTile, co * sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
}
#else

kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(
TileSizeM, TileSizeN, d_kh * d_kw, ci, ATile, BTile, CTile, co * sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
#endif // ENABLE_QMX_KERNELS

}
});

Expand Down
2 changes: 2 additions & 0 deletions onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ namespace ArmKleidiAI {

// By default we should try for SME2 first before falling back to SME.
inline const bool UseSME2 = MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME2();
inline const bool UseSME = MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME();
inline const std::string_view vendor_name = MLAS_CPUIDINFO::GetCPUIDInfo().GetCPUVendor();

// Buffer packing routines.
//
Expand Down
59 changes: 51 additions & 8 deletions onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@
#include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon.h"

#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.h"
#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa.h"
#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot.h"
#if defined(ENABLE_QMX_KERNELS)
#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa.h"
#endif // ENABLE_QMX_KERNELS

#include "mlasi_kleidiai.h"

Expand Down Expand Up @@ -108,14 +112,53 @@ ArmKleidiAI::MlasDynamicQGemmBatch(
<< " M="<< Shape.M << " K=" << Shape.K << " mr=" << mr << " kr=" << kr << " sr=" << sr << " m_idx_start=0");
kai_run_lhs_quant_pack_qai8dxp_f32(Shape.M, Shape.K, mr, kr, sr, 0, DataParams->A,
Shape.K*sizeof(float), lhs);
if (ArmKleidiAI::UseSME2)
{
KLEIDIAI_KERNEL_LOG("kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa");
kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(
Shape.M, Shape.N, Shape.K, lhs, DataParams->PackedB,
DataParams->C,
Shape.N * sizeof(float),
sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
}
else {

#if defined(ENABLE_QMX_KERNELS)
if (ArmKleidiAI::vendor_name.compare("Qualcomm") == 0)
{
KLEIDIAI_KERNEL_LOG("QGEMM: Using QMX Kernel");
kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa(
Shape.M, Shape.N, Shape.K, lhs, DataParams->PackedB,
DataParams->C,
Shape.N * sizeof(float),
sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);

}
else {
KLEIDIAI_KERNEL_LOG("kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa");
kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(
Shape.M, Shape.N, Shape.K, lhs, DataParams->PackedB,
DataParams->C,
Shape.N * sizeof(float),
sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
}
#else
KLEIDIAI_KERNEL_LOG("kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa");
kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(
Shape.M, Shape.N, Shape.K, lhs, DataParams->PackedB,
DataParams->C,
Shape.N * sizeof(float),
sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
#endif // ENABLE_QMX_KERNELS

// NOTE(review): this unconditional SME2 call runs AFTER the #if/#else dispatch
// above, which already issues exactly one matmul in this else-branch. It would
// execute the GEMM a second time and invoke the SME2 kernel on hardware that the
// enclosing `else` (UseSME2 == false) has established lacks SME2 — presumably a
// leftover from the pre-QMX code path. Confirm and remove.
KLEIDIAI_KERNEL_LOG("kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa");
kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(
    Shape.M, Shape.N, Shape.K, lhs, DataParams->PackedB,
    DataParams->C,
    Shape.N * sizeof(float),
    sizeof(float),
    -std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
}
}
}
13 changes: 8 additions & 5 deletions onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
#include "mlasi_kleidiai.h"
#include "kai_ukernel_interface.h"

#if defined(ENABLE_QMX_KERNELS)
#include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa.h"
#endif // ENABLE_QMX_KERNELS

// Thread-local reusable buffers to reduce allocation overhead across tiles.
struct KaiTlsBuffers {
Expand Down Expand Up @@ -145,9 +148,9 @@ ArmKleidiAI::MlasGemvBatch(
if (M != 1 && N != 1) {
return false;
}

const bool m_path = (M == 1);

// We cannot support cases where N == 1 and B is already packed.
// When both are 1, we route through the M-path, so this naturally doesn't trigger.
if (!m_path && Data->BIsPacked) {
Expand All @@ -165,15 +168,15 @@ ArmKleidiAI::MlasGemvBatch(
// - M-path: LHS is A, stride = lda
// - N-path: LHS is B, stride = ldb
size_t lhs_ld = m_path ? Data[b].lda : Data[b].ldb;

const float* rhs_base = m_path ? static_cast<const float*>(Data[b].B)
: static_cast<const float*>(Data[b].A);
const float* lhs_base = m_path ? static_cast<const float*>(Data[b].A)
const float* lhs_base = m_path ? static_cast<const float*>(Data[b].A)
: static_cast<const float*>(Data[b].B);

// Prepare packed RHS if needed
const void* rhs_packed_ptr = nullptr;

// The if branch can only be taken in cases where we are dealing with M == 1
// We previously reject any prepacked B where N == 1
// In cases where N == 1 we Pack A Matrix as the RHS using tb = CBlasTrans
Expand Down
4 changes: 2 additions & 2 deletions onnxruntime/core/mlas/lib/platform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ Module Name:
#ifdef MLAS_USE_SVE
#include "sve/mlasi_sve.h"
#endif
#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
#include "kleidiai/mlasi_kleidiai.h"
#endif

Expand Down Expand Up @@ -600,7 +600,7 @@ Return Value:
this->ConvSymS8S8Dispatch = &MlasConvSymS8DispatchDot;
}

#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
if(MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME()){
this->MlasGemmBatchOverride = ArmKleidiAI::MlasGemmBatch;
this->MlasGemmPackBSizeOverride = ArmKleidiAI::MlasGemmPackBSize;
Expand Down
18 changes: 12 additions & 6 deletions onnxruntime/core/mlas/lib/qgemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ Module Name:
#include "qgemm.h"

// TODO: When overrides are implemented, remove this
#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
#include "kleidiai/mlasi_kleidiai.h"
#endif

Expand Down Expand Up @@ -205,8 +205,14 @@ bool
MLASCALL
MlasIsDynamicQGemmAvailable()
{
#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
return ArmKleidiAI::UseSME2;
#if defined(USE_KLEIDIAI)
if(ArmKleidiAI::UseSME2) {
return ArmKleidiAI::UseSME2;
}
else {
return ArmKleidiAI::UseSME;
}

#else
return false;
#endif
Expand All @@ -222,7 +228,7 @@ MlasDynamicQGemmBatch (
) {
assert(MlasIsDynamicQGemmAvailable());

#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
//No fallback
ArmKleidiAI::MlasDynamicQGemmBatch(Shape, DataParams, BatchN, ThreadPool);
#endif
Expand Down Expand Up @@ -346,7 +352,7 @@ MlasDynamicQgemmPackBSize(
assert(MlasIsDynamicQGemmAvailable());

size_t bytes = 0;
#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
//No fallback available
//TODO: Insert Override
bytes = ArmKleidiAI::MlasDynamicQgemmPackBSize(N, K);
Expand Down Expand Up @@ -440,7 +446,7 @@ MlasDynamicQgemmPackB(
{
assert(MlasIsDynamicQGemmAvailable());

#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
//No fallback
ArmKleidiAI::MlasDynamicQgemmPackB(N, K, B, Scales, Bias, PackedB);
#endif
Expand Down
4 changes: 2 additions & 2 deletions onnxruntime/core/mlas/lib/sgemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1670,7 +1670,7 @@ Return Value:
// Compute the number of bytes required to hold the packed buffer.
//
// KleidiAI or other override
#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
if (GetMlasPlatform().MlasGemmPackBSizeOverride != nullptr &&
// TODO: Remove once KAI supports transposing for A
TransA != CBLAS_TRANSPOSE::CblasTrans) {
Expand Down Expand Up @@ -1737,7 +1737,7 @@ Return Value:

--*/
{
#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
if (GetMlasPlatform().MlasGemmPackBOverride != nullptr &&
// TODO: Remove once KAI supports transposing for A
TransA != CBLAS_TRANSPOSE::CblasTrans &&
Expand Down
Loading