Commits

31 commits
517e166
Implement multithreading in qgemm_kleidi
melkap01-Arm Oct 14, 2025
bd05fce
fixes addressed:
melkap01-Arm Oct 30, 2025
75fee7a
lhs_base_table buffer implemented inside TLS
melkap01-Arm Oct 31, 2025
e53e67b
Multithreaded qgemm coverage with single- and multi-threaded tests
melkap01-Arm Nov 3, 2025
c2428fb
Test commit damdoo01
damdoo01-arm Nov 10, 2025
0479aea
Undo Test commit damdoo01
damdoo01-arm Nov 10, 2025
9ef3b4c
SME2 test case check moved into the Test() function, after rebase
melkap01-Arm Nov 28, 2025
32bf43c
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
melkap01-Arm Dec 4, 2025
b6ff3be
Dynamic Qgemm Prepack() refactored
melkap01-Arm Dec 4, 2025
7ed0e5c
Merge branch 'main' into melkap01_implement_mt_qgemm
melkap01-Arm Dec 5, 2025
76ba64f
Quant Kernel log added, include corrected in dynamic qgemm test
melkap01-Arm Dec 5, 2025
6bac9a5
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
melkap01-Arm Dec 10, 2025
f1605e5
Provider test cases for KleidiAI dynamic qgemms added
melkap01-Arm Dec 10, 2025
0c3748b
-Arm KleidiAI helper methods in Mlas space commented.
melkap01-Arm Dec 15, 2025
99fe8c5
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
melkap01-Arm Dec 18, 2025
47e4c92
KleidiAI dynamic quantization supported by promoting 1D B tensor to 2D
melkap01-Arm Dec 19, 2025
fb8eefb
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
melkap01-Arm Dec 22, 2025
d9a26bf
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
melkap01-Arm Jan 2, 2026
cd80e56
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
melkap01-Arm Jan 6, 2026
6356e68
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
melkap01-Arm Jan 6, 2026
50dddaf
lintrunner issue fixed
melkap01-Arm Jan 6, 2026
017a425
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
melkap01-Arm Jan 6, 2026
2ad388c
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
melkap01-Arm Jan 7, 2026
8dc8bc3
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
melkap01-Arm Jan 8, 2026
e000f04
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
melkap01-Arm Jan 9, 2026
abfb0f3
Address feedback on testcases and other issues
JonathanC-ARM Jan 14, 2026
3bf22f2
Merge remote-tracking branch 'mel/main' into HEAD
JonathanC-ARM Jan 14, 2026
11c856c
Move dynamic quant tests to more appropriate file
JonathanC-ARM Jan 14, 2026
0c60f4e
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
JonathanC-ARM Jan 15, 2026
ce02e98
Removal of comment and debug print in test case
JonathanC-ARM Jan 15, 2026
94d8fe5
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
JonathanC-ARM Jan 16, 2026
151 changes: 17 additions & 134 deletions onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc
@@ -164,132 +164,23 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase {
Status Compute(OpKernelContext* context) const override;

#if defined(USE_KLEIDIAI)
Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) override {
// only pack Matrix B
if (input_idx == GetBIdx()) {
const Tensor* b_zp_constant_tensor{nullptr};
bool b_quantization_might_be_asymmetric = false;

const OrtValue* b_zp;
if (Info().TryGetConstantInput(IN_B_ZERO_POINT, &b_zp)) {
b_zp_constant_tensor = &b_zp->Get<Tensor>();
}

// MlasDynamicQgemm requires symmetric quantization for B, so the B zero point value should either be all zeros
// or not provided.
if (b_zp_constant_tensor != nullptr) {
// B zero point is constant. Check if it is all zeros.
assert(b_zp_constant_tensor->IsDataType<uint8_t>() || b_zp_constant_tensor->IsDataType<int8_t>());
const auto* zp_bytes = static_cast<const std::byte*>(b_zp_constant_tensor->DataRaw());
const size_t zp_size_in_bytes = b_zp_constant_tensor->SizeInBytes();
b_quantization_might_be_asymmetric = std::any_of(zp_bytes, zp_bytes + zp_size_in_bytes,
[](std::byte v) { return v != std::byte{0}; });
} else {
// B zero point input is not constant. If it exists, we can't assume symmetric quantization.
const auto input_defs = Info().node().InputDefs();
const bool b_zp_input_exists = input_defs.size() > IN_B_ZERO_POINT && input_defs[IN_B_ZERO_POINT]->Exists();
b_quantization_might_be_asymmetric = b_zp_input_exists;
}

// MlasDynamicQgemm requires scale data to be available at packing stage
const Tensor* b_scale_tensor = nullptr;
const bool b_scale_available = Info().TryGetConstantInput(IN_B_SCALE, &b_scale_tensor);

can_use_dynamic_quant_mlas_ = (!b_quantization_might_be_asymmetric && b_scale_available);

// Kleidi dynamic path requires strictly positive, finite scales.
// Disable if any invalid scale is detected.
if (can_use_dynamic_quant_mlas_) {
const auto bs = b_scale_tensor->DataAsSpan<float>();
const bool has_invalid =
std::any_of(bs.begin(), bs.end(),
[](float s) { return !std::isfinite(s) || s <= 0.0f; });

if (has_invalid) {
can_use_dynamic_quant_mlas_ = false;
}
}

if (!MlasIsDynamicQGemmAvailable()) {
can_use_dynamic_quant_mlas_ = false;
}

// Only handle the common case of a 2D weight matrix. Additional matrices
// could be handled by stacking the packed buffers.
b_shape_ = tensor.Shape();
if (b_shape_.NumDimensions() >= 2) {
for (size_t i = 0; i < (b_shape_.NumDimensions() - 2); ++i) {
if (b_shape_[i] != 1) {
can_use_dynamic_quant_mlas_ = false;
break;
}
}
} else {
can_use_dynamic_quant_mlas_ = false;
}

// Can we use the mlas dynamic Q gemm interface supported with float output ?
if (!can_use_dynamic_quant_mlas_) {
// default to piece wise mlas interface with separate int matmul, quantize and float conversion
return MatMulIntegerToFloatBase::PrePack(tensor, input_idx, alloc, is_packed, prepacked_weights);
}
is_packed = false;

// Default to all zeros for bias
const Tensor* bias_tensor{nullptr};
const OrtValue* bias;
if (Info().TryGetConstantInput(IN_BIAS, &bias)) {
bias_tensor = &bias->Get<Tensor>();
dynamic_quant_mlas_bias_data_was_packed_ = true;
}
size_t K = static_cast<size_t>(b_shape_[0]);
size_t N = static_cast<size_t>(b_shape_[1]);

const auto* b_data = static_cast<const uint8_t*>(tensor.DataRaw());

std::optional<Tensor> b_trans_buffer;
if (IsBTransposed()) {
std::swap(K, N);
b_data = quantization::TransPoseInputData(b_data, b_trans_buffer, alloc, N, K);
}
bool SupportsKleidiaiDynamicQuant() const override {
if (!MlasIsDynamicQGemmAvailable()) {
return false;
}
return true;
}

const size_t packed_b_size = MlasDynamicQgemmPackBSize(N, K);
if (packed_b_size == 0) {
return Status::OK();
}
int GetBScaleIdx() const override {
return IN_B_SCALE;
}

packed_b_ = IAllocator::MakeUniquePtr<void>(alloc, packed_b_size, true);
// Initialize memory to 0 as there could be some padding associated with pre-packed
// buffer memory and we do not want it uninitialized and generate different hashes
// if and when we try to cache this pre-packed buffer for sharing between sessions.
memset(packed_b_.get(), 0, packed_b_size);

const auto scales = static_cast<size_t>(b_scale_tensor->Shape().Size()) == N ? std::vector<float>(&b_scale_tensor->Data<float>()[0],
&b_scale_tensor->Data<float>()[N])
:
// Broadcast matrix scale to all channels
std::vector<float>(N, b_scale_tensor->Data<float>()[0]);

const auto biases = bias_tensor != nullptr ? std::vector<float>(&bias_tensor->Data<float>()[0],
&bias_tensor->Data<float>()[N])
:
// Broadcast zero to all channels - no bias data is available
std::vector<float>(N, 0.f);

MlasDynamicQgemmPackB(N, K, reinterpret_cast<const int8_t*>(b_data), scales.data(), biases.data(),
packed_b_.get());

bool share_prepacked_weights = (prepacked_weights != nullptr);
if (share_prepacked_weights) {
prepacked_weights->buffers_.push_back(std::move(packed_b_));
prepacked_weights->buffer_sizes_.push_back(packed_b_size);
}
int GetBZeroPointIdx() const override {
return IN_B_ZERO_POINT;
}

is_packed = true;
}
return Status::OK();
int GetBiasIdx() const override {
return IN_BIAS;
}
#endif

@@ -303,14 +194,6 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase {

protected:
int GetBIdx() const override { return IN_B; }

private:
// Indicates when MlasDynamicQGemmBatch() can be used
bool can_use_dynamic_quant_mlas_{false};
#if defined(USE_KLEIDIAI)
// Indicates that the biases are a constant input and thus already quantized / packed
bool dynamic_quant_mlas_bias_data_was_packed_{false};
#endif
};

class MatMulIntegerToFloat final : public MatMulIntegerToFloatBase {
@@ -381,7 +264,7 @@ Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const {
}
}
// Guard against KleidiAI functions being called in non kleidi builds
// TODO: migrate to a suitable override function call for kleidi dynamic qgemm function calls
// migrate to a suitable override function call for kelidiai dynamic qgemm function calls
Copilot AI Jan 16, 2026

Corrected spelling of 'kelidiai' to 'KleidiAI'.

Suggested change
// migrate to a suitable override function call for kelidiai dynamic qgemm function calls
// migrate to a suitable override function call for KleidiAI dynamic qgemm function calls
#if defined(USE_KLEIDIAI)
else {
MatMulComputeHelper helper;
@@ -390,10 +273,10 @@ Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const {
// deleted during session init post prepacking
nullptr,
nullptr));

// allocate the kernel’s output tensor from the execution context
Tensor* y = ctx->Output(OUT_Y, helper.OutputShape());

// Bail out early if the output is going to be empty
// Bail out early if any dimension is 0; the product (and hence the total number of elements) is 0
if (y->Shape().Size() == 0)
return Status::OK();

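Taken together, the deleted PrePack body above reduces to a fixed pack-B pipeline around three MLAS entry points. Below is a minimal standalone sketch of that pipeline, not ONNX Runtime code: the helper name PackBForDynamicQgemm and the "mlas.h" include are assumptions for illustration, while MlasIsDynamicQGemmAvailable, MlasDynamicQgemmPackBSize and MlasDynamicQgemmPackB are the calls visible in this diff. B is assumed already transposed to K x N and symmetrically quantized (zero zero-point), as the checks above require.

// Minimal sketch (assumed helper, not ORT code) of the dynamic-qgemm pack-B flow.
#include <cstdint>
#include <vector>

#include "mlas.h"  // assumed location of the MlasDynamicQgemm* declarations

std::vector<uint8_t> PackBForDynamicQgemm(size_t N, size_t K, const int8_t* b_data,
                                          const std::vector<float>& b_scale,  // per-channel (N) or scalar (1)
                                          const float* bias) {                // nullptr => no bias
  if (!MlasIsDynamicQGemmAvailable())  // no SME/SME2 dynamic kernels in this CPU/build
    return {};

  const size_t packed_size = MlasDynamicQgemmPackBSize(N, K);
  if (packed_size == 0)
    return {};

  // Broadcast a scalar scale to all N channels and substitute zeros when no bias
  // is given, mirroring the deleted inline implementation above.
  const std::vector<float> scales =
      b_scale.size() == N ? b_scale : std::vector<float>(N, b_scale.at(0));
  const std::vector<float> biases =
      bias != nullptr ? std::vector<float>(bias, bias + N) : std::vector<float>(N, 0.f);

  // Value-initialized storage keeps any packing padding deterministic, which matters
  // when prepacked buffers are hashed and shared between sessions.
  std::vector<uint8_t> packed(packed_size);
  MlasDynamicQgemmPackB(N, K, b_data, scales.data(), biases.data(), packed.data());
  return packed;
}

With the refactor, these steps move into the shared base class, and the derived kernel only answers SupportsKleidiaiDynamicQuant() and reports which inputs hold the scale, zero point and bias via GetBScaleIdx(), GetBZeroPointIdx() and GetBiasIdx().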
1 change: 1 addition & 0 deletions onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h
@@ -53,6 +53,7 @@ namespace ArmKleidiAI {

// By default we should try for SME2 first before falling back to SME.
inline const bool UseSME2 = MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME2();
inline const bool UseSME = MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME();

// Buffer packing routines.
//
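The new UseSME flag mirrors the existing UseSME2 one: both cache the CPUID probe in an inline const at static-init time, so dispatch sites can prefer SME2 and fall back to SME without re-querying the CPU. A small illustrative sketch of such a dispatch follows; the enum and function are placeholders, not KleidiAI or MLAS symbols, and the include path is assumed from this diff.

// Illustrative dispatch sketch (placeholder names), assuming the internal header
// that defines ArmKleidiAI::UseSME2 / ArmKleidiAI::UseSME.
#include "mlasi_kleidiai.h"

enum class SmeVariant { kSme2, kSme, kNone };

inline SmeVariant PickSmeVariant() {
  if (ArmKleidiAI::UseSME2)
    return SmeVariant::kSme2;  // prefer SME2 when the core reports it
  if (ArmKleidiAI::UseSME)
    return SmeVariant::kSme;   // otherwise fall back to plain SME
  return SmeVariant::kNone;    // caller routes to the generic MLAS path
}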