Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmake/deps.txt
Original file line number Diff line number Diff line change
Expand Up @@ -57,4 +57,5 @@ directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v
cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.12.0.zip;7e733cfdc410d777b76122d64232499205589a96
dawn;https://github.com/google/dawn/archive/13c1635a14574ebb7116b56a69f5519301417fda.zip;0aadd28fc385cf7d657d5fc70a352372d2d3c76a
kleidiai;https://github.com/ARM-software/kleidiai/archive/refs/tags/v1.15.0.tar.gz;62ccd24ab60bcef68766440fb42d79071ac2a5d2
kleidiai-qmx;https://github.com/qualcomm/kleidiai/archive/85927d96456a18b9fd3b7d0f17bc2f3e0163f429.zip;255a54e4ef9d498c9c6987582f69032e031b38cf
duktape;https://github.com/svaarala/duktape/releases/download/v2.7.0/duktape-2.7.0.tar.xz;8200c8e417dbab7adcc12c4dbdef7651cfc55794
6 changes: 6 additions & 0 deletions cmake/external/onnxruntime_external_deps.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -845,6 +845,12 @@ if(onnxruntime_USE_KLEIDIAI)

onnxruntime_fetchcontent_declare(kleidiai URL ${DEP_URL_kleidiai} URL_HASH SHA1=${DEP_SHA1_kleidiai} EXCLUDE_FROM_ALL)
onnxruntime_fetchcontent_makeavailable(kleidiai)
# Fetch Qualcomm's kleidiai library
if(ENABLE_KLEIDIAI_QMX_COEXIST)
onnxruntime_fetchcontent_declare(kleidiai-qmx URL ${DEP_URL_kleidiai-qmx} URL_HASH SHA1=${DEP_SHA1_kleidiai-qmx}
EXCLUDE_FROM_ALL)
onnxruntime_fetchcontent_makeavailable(kleidiai-qmx)
endif()
endif()

set(onnxruntime_LINK_DIRS)
Expand Down
13 changes: 13 additions & 0 deletions cmake/onnxruntime_mlas.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,11 @@ function(setup_kleidiai)
)
target_link_libraries(onnxruntime_mlas PRIVATE kleidiai)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES kleidiai)
if(ENABLE_KLEIDIAI_QMX_COEXIST)
target_link_libraries(onnxruntime_mlas PRIVATE kleidiai-qmx)
target_compile_definitions(onnxruntime_mlas PRIVATE ENABLE_QMX_KERNELS=1)
list(APPEND onnxruntime_EXTERNAL_LIBRARIES kleidiai-qmx)
endif()
set(onnxruntime_EXTERNAL_LIBRARIES ${onnxruntime_EXTERNAL_LIBRARIES} PARENT_SCOPE)

# If KLEIDIAI_DEBUG is enabled that implies both DEBUG and KERNEL messages.
Expand All @@ -302,6 +307,14 @@ function(setup_kleidiai)
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()

if(ENABLE_KLEIDIAI_QMX_COEXIST)
install(TARGETS kleidiai-qmx EXPORT ${PROJECT_NAME}Targets
ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}
FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR})
endif()
endfunction()

function (setup_arm_neon_nchwc)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase {

Status Compute(OpKernelContext* context) const override;

#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) override {
Expand Down Expand Up @@ -307,7 +307,7 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase {
private:
// Indicates when MlasDynamicQGemmBatch() can be used
bool can_use_dynamic_quant_mlas_{false};
#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
// Indicates that the biases are a constant input and thus already quantized / packed
bool dynamic_quant_mlas_bias_data_was_packed_{false};
#endif
Expand Down Expand Up @@ -382,7 +382,7 @@ Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const {
}
// Guard against KleidiAI functions being called in non kleidi builds
// TODO: migrate to a suitable override function call for kleidi dynamic qgemm function calls
#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
else {
MatMulComputeHelper helper;
ORT_RETURN_IF_ERROR(helper.Compute(ctx->Input<Tensor>(IN_A)->Shape(),
Expand Down
2 changes: 1 addition & 1 deletion onnxruntime/core/mlas/inc/mlas.h
Original file line number Diff line number Diff line change
Expand Up @@ -2116,7 +2116,7 @@ MlasFlashAttention(
MLAS_THREADPOOL* ThreadPool
);

#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
/**
* @brief Function to override the packing mechanism decision if kleidi ai is included
* @param enable enable kleidiai packing (allow or disallow depending on true/false)
Expand Down
29 changes: 29 additions & 0 deletions onnxruntime/core/mlas/lib/kai_ukernel_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
#include "kai/ukernels/matmul/matmul_clamp_f32_f32_f32p/kai_matmul_clamp_f32_f32_f32p8x1biasf32_6x8x4_neon_mla.h"
#include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa.h"
#include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa.h"
#if defined(ENABLE_QMX_KERNELS)
#include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa.h"
#endif // ENABLE_QMX_KERNELS

const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel kai_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod =
{kai_get_m_step_matmul_clamp_f32_qai8dxp1x4_qsi4c32p4x4_1x4_neon_dotprod,
Expand Down Expand Up @@ -122,6 +125,21 @@ const kai_matmul_clamp_f32_f32p_f32p_ukernel sgemm_gemm_sme2 =
kai_get_dst_size_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa,
kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_sme2_mopa};

#if defined(ENABLE_QMX_KERNELS)
const kai_matmul_clamp_f32_f32p_f32p_ukernel sgemm_gemm_qmx =
{kai_get_m_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_n_step_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_mr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_nr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_kr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_sr_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_lhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_rhs_packed_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_dst_offset_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_get_dst_size_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa,
kai_run_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa};
#endif // ENABLE_QMX_KERNELS

const kai_matmul_clamp_f32_qai8dxp_qsi4c32p_ukernel& GetKleidiAIGemmUKernel() {
if (MLAS_CPUIDINFO::GetCPUIDInfo().HasArmNeon_I8MM()) {
return kai_matmul_clamp_f32_qai8dxp4x8_qsi4c32p4x8_16x4x32_neon_i8mm;
Expand All @@ -142,7 +160,18 @@ const kai_matmul_clamp_f32_f32p_f32p_ukernel& GetKleidiAISGemmUKernel() {
if (MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME2()) {
return sgemm_gemm_sme2;
} else {
#if defined(ENABLE_QMX_KERNELS)
if (ArmKleidiAI::vendor_name.compare("Qualcomm") == 0)
{
KLEIDIAI_KERNEL_LOG("SGEMM: Using QMX Kernel");
return sgemm_gemm_qmx;

} else {
return sgemm_gemm_sme;
}
#else
return sgemm_gemm_sme;
#endif // ENABLE_QMX_KERNELS
}
}

Expand Down
31 changes: 27 additions & 4 deletions onnxruntime/core/mlas/lib/kleidiai/convolve_kleidiai.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@
#include "kai/ukernels/matmul/imatmul_clamp_f32_f32p_f32p/kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme2_mopa.h"
#include "kai/ukernels/matmul/pack/kai_lhs_imatmul_pack_x32p2vlx1_x32p_sme.h"
#include "kai/ukernels/matmul/pack/kai_rhs_imatmul_pack_kxn_x32p2vlx1b_x32_x32_sme.h"
#if defined(ENABLE_QMX_KERNELS)
#include "kai/ukernels/matmul/imatmul_clamp_f32_f32p_f32p/kai_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_qmx_mopa.h"
#endif // ENABLE_QMX_KERNELS


// Right-hand-side (weights) cache key
struct RhsCacheKey {
Expand Down Expand Up @@ -597,10 +601,29 @@ static void ConvolveSme(const size_t co, //channels out
);
} else {
KLEIDIAI_KERNEL_LOG("kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa" << " M=" << TileSizeM << " N=" << TileSizeN << " k_chunk_count=" << (d_kh * d_kw) << " k_chunk_length=" << ci);
kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(
TileSizeM, TileSizeN, d_kh * d_kw, ci, ATile, BTile, CTile, co * sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
#if defined(ENABLE_QMX_KERNELS)
if (ArmKleidiAI::vendor_name.compare("Qualcomm") == 0)
{
KLEIDIAI_KERNEL_LOG("Convolve: Using QMX Kernel");
kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_qmx_mopa(
TileSizeM, TileSizeN, d_kh * d_kw, ci, ATile, BTile, CTile, co * sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
}
else {
kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(
TileSizeM, TileSizeN, d_kh * d_kw, ci, ATile, BTile, CTile, co * sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
}
#else

kai_run_imatmul_clamp_f32_f32p2vlx1_f32p2vlx1b_2vlx2vl_sme_mopa(
TileSizeM, TileSizeN, d_kh * d_kw, ci, ATile, BTile, CTile, co * sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
#endif // ENABLE_QMX_KERNELS

}
});

Expand Down
2 changes: 2 additions & 0 deletions onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ namespace ArmKleidiAI {

// By default we should try for SME2 first before falling back to SME.
inline const bool UseSME2 = MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME2();
inline const bool UseSME = MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME();
inline const std::string_view vendor_name = MLAS_CPUIDINFO::GetCPUIDInfo().GetCPUVendor();

// Buffer packing routines.
//
Expand Down
59 changes: 51 additions & 8 deletions onnxruntime/core/mlas/lib/kleidiai/qgemm_kleidiai.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,11 @@
#include "kai/ukernels/matmul/pack/kai_rhs_pack_kxn_qsi8cxp_qsi8cx_neon.h"

#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa.h"
#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa.h"
#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1x4_qsi8cxp4vlx4_1x4vl_sme2_dot.h"
#if defined(ENABLE_QMX_KERNELS)
#include "kai/ukernels/matmul/matmul_clamp_f32_qai8dxp_qsi8cxp/kai_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa.h"
#endif // ENABLE_QMX_KERNELS

#include "mlasi_kleidiai.h"

Expand Down Expand Up @@ -108,14 +112,53 @@ ArmKleidiAI::MlasDynamicQGemmBatch(
<< " M="<< Shape.M << " K=" << Shape.K << " mr=" << mr << " kr=" << kr << " sr=" << sr << " m_idx_start=0");
kai_run_lhs_quant_pack_qai8dxp_f32(Shape.M, Shape.K, mr, kr, sr, 0, DataParams->A,
Shape.K*sizeof(float), lhs);
if (ArmKleidiAI::UseSME2)
{
KLEIDIAI_KERNEL_LOG("kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa");
kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(
Shape.M, Shape.N, Shape.K, lhs, DataParams->PackedB,
DataParams->C,
Shape.N * sizeof(float),
sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
}
else {

#if defined(ENABLE_QMX_KERNELS)
if (ArmKleidiAI::vendor_name.compare("Qualcomm") == 0)
{
KLEIDIAI_KERNEL_LOG("QGEMM: Using QMX Kernel");
kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_qmx_mopa(
Shape.M, Shape.N, Shape.K, lhs, DataParams->PackedB,
DataParams->C,
Shape.N * sizeof(float),
sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);

}
else {
KLEIDIAI_KERNEL_LOG("kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa");
kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(
Shape.M, Shape.N, Shape.K, lhs, DataParams->PackedB,
DataParams->C,
Shape.N * sizeof(float),
sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
}
#else
KLEIDIAI_KERNEL_LOG("kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa");
kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme_mopa(
Shape.M, Shape.N, Shape.K, lhs, DataParams->PackedB,
DataParams->C,
Shape.N * sizeof(float),
sizeof(float),
-std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
#endif // ENABLE_QMX_KERNELS

// NOTE(review): this unconditional SME2 call runs AFTER the #if/#else dispatch
// above, which already issues exactly one matmul in this else-branch. It would
// execute the GEMM a second time and invoke the SME2 kernel on hardware that the
// enclosing `else` (UseSME2 == false) has established lacks SME2 — presumably a
// leftover from the pre-QMX code path. Confirm and remove.
KLEIDIAI_KERNEL_LOG("kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa");
kai_run_matmul_clamp_f32_qai8dxp1vlx4_qsi8cxp4vlx4_1vlx4vl_sme2_mopa(
    Shape.M, Shape.N, Shape.K, lhs, DataParams->PackedB,
    DataParams->C,
    Shape.N * sizeof(float),
    sizeof(float),
    -std::numeric_limits<float>::max(), std::numeric_limits<float>::max()
);
}
}
}
13 changes: 8 additions & 5 deletions onnxruntime/core/mlas/lib/kleidiai/sgemm_kleidiai.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
#include "mlasi_kleidiai.h"
#include "kai_ukernel_interface.h"

#if defined(ENABLE_QMX_KERNELS)
#include "kai/ukernels/matmul/matmul_clamp_f32_f32p_f32p/kai_matmul_clamp_f32_f32p2vlx1_f32p2vlx1biasf32_qmx_mopa.h"
#endif // ENABLE_QMX_KERNELS

// Thread-local reusable buffers to reduce allocation overhead across tiles.
struct KaiTlsBuffers {
Expand Down Expand Up @@ -145,9 +148,9 @@ ArmKleidiAI::MlasGemvBatch(
if (M != 1 && N != 1) {
return false;
}

const bool m_path = (M == 1);

// We cannot support cases where N == 1 and B is already packed.
// When both are 1, we route through the M-path, so this naturally doesn't trigger.
if (!m_path && Data->BIsPacked) {
Expand All @@ -165,15 +168,15 @@ ArmKleidiAI::MlasGemvBatch(
// - M-path: LHS is A, stride = lda
// - N-path: LHS is B, stride = ldb
size_t lhs_ld = m_path ? Data[b].lda : Data[b].ldb;

const float* rhs_base = m_path ? static_cast<const float*>(Data[b].B)
: static_cast<const float*>(Data[b].A);
const float* lhs_base = m_path ? static_cast<const float*>(Data[b].A)
const float* lhs_base = m_path ? static_cast<const float*>(Data[b].A)
: static_cast<const float*>(Data[b].B);

// Prepare packed RHS if needed
const void* rhs_packed_ptr = nullptr;

// The if branch can only be taken in cases where we are dealing with M == 1
// We previously reject any prepacked B where N == 1
// In cases where N == 1 we Pack A Matrix as the RHS using tb = CBlasTrans
Expand Down
4 changes: 2 additions & 2 deletions onnxruntime/core/mlas/lib/platform.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ Module Name:
#ifdef MLAS_USE_SVE
#include "sve/mlasi_sve.h"
#endif
#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
#include "kleidiai/mlasi_kleidiai.h"
#endif

Expand Down Expand Up @@ -600,7 +600,7 @@ Return Value:
this->ConvSymS8S8Dispatch = &MlasConvSymS8DispatchDot;
}

#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
if(MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME()){
this->MlasGemmBatchOverride = ArmKleidiAI::MlasGemmBatch;
this->MlasGemmPackBSizeOverride = ArmKleidiAI::MlasGemmPackBSize;
Expand Down
18 changes: 12 additions & 6 deletions onnxruntime/core/mlas/lib/qgemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ Module Name:
#include "qgemm.h"

// TODO: When overrides are implemented, remove this
#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
#include "kleidiai/mlasi_kleidiai.h"
#endif

Expand Down Expand Up @@ -205,8 +205,14 @@ bool
MLASCALL
MlasIsDynamicQGemmAvailable()
{
#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
return ArmKleidiAI::UseSME2;
#if defined(USE_KLEIDIAI)
if(ArmKleidiAI::UseSME2) {
return ArmKleidiAI::UseSME2;
}
else {
return ArmKleidiAI::UseSME;
}

#else
return false;
#endif
Expand All @@ -222,7 +228,7 @@ MlasDynamicQGemmBatch (
) {
assert(MlasIsDynamicQGemmAvailable());

#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
//No fallback
ArmKleidiAI::MlasDynamicQGemmBatch(Shape, DataParams, BatchN, ThreadPool);
#endif
Expand Down Expand Up @@ -346,7 +352,7 @@ MlasDynamicQgemmPackBSize(
assert(MlasIsDynamicQGemmAvailable());

size_t bytes = 0;
#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
//No fallback available
//TODO: Insert Override
bytes = ArmKleidiAI::MlasDynamicQgemmPackBSize(N, K);
Expand Down Expand Up @@ -440,7 +446,7 @@ MlasDynamicQgemmPackB(
{
assert(MlasIsDynamicQGemmAvailable());

#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
//No fallback
ArmKleidiAI::MlasDynamicQgemmPackB(N, K, B, Scales, Bias, PackedB);
#endif
Expand Down
4 changes: 2 additions & 2 deletions onnxruntime/core/mlas/lib/sgemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1670,7 +1670,7 @@ Return Value:
// Compute the number of bytes required to hold the packed buffer.
//
// KleidiAI or other override
#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
if (GetMlasPlatform().MlasGemmPackBSizeOverride != nullptr &&
// TODO: Remove once KAI supports transposing for A
TransA != CBLAS_TRANSPOSE::CblasTrans) {
Expand Down Expand Up @@ -1737,7 +1737,7 @@ Return Value:

--*/
{
#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
#if defined(USE_KLEIDIAI)
if (GetMlasPlatform().MlasGemmPackBOverride != nullptr &&
// TODO: Remove once KAI supports transposing for A
TransA != CBLAS_TRANSPOSE::CblasTrans &&
Expand Down
Loading