Commits

31 commits
517e166
Implement multithreading in qgemm_kleidi
melkap01-Arm Oct 14, 2025
bd05fce
fixes addressed:
melkap01-Arm Oct 30, 2025
75fee7a
lhs_base_table buffer implemented inside TLS
melkap01-Arm Oct 31, 2025
e53e67b
Multithreaded qgemm coverage with single- and multi-threaded tests
melkap01-Arm Nov 3, 2025
c2428fb
Test commit damdoo01
damdoo01-arm Nov 10, 2025
0479aea
Undo Test commit damdoo01
damdoo01-arm Nov 10, 2025
9ef3b4c
SME2 test case check moved into the Test() function, after rebase
melkap01-Arm Nov 28, 2025
32bf43c
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
melkap01-Arm Dec 4, 2025
b6ff3be
Dynamic Qgemm Prepack() refactored
melkap01-Arm Dec 4, 2025
7ed0e5c
Merge branch 'main' into melkap01_implement_mt_qgemm
melkap01-Arm Dec 5, 2025
76ba64f
Quant Kernel log added, include corrected in dynamic qgemm test
melkap01-Arm Dec 5, 2025
6bac9a5
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
melkap01-Arm Dec 10, 2025
f1605e5
Provider test cases for KleidiAI dynamic qgemms added
melkap01-Arm Dec 10, 2025
0c3748b
-Arm KleidiAI helper methods in Mlas space commented.
melkap01-Arm Dec 15, 2025
99fe8c5
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
melkap01-Arm Dec 18, 2025
47e4c92
KleidiAI dynamic quantization supported by promoting 1D B tensor to 2D
melkap01-Arm Dec 19, 2025
fb8eefb
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
melkap01-Arm Dec 22, 2025
d9a26bf
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
melkap01-Arm Jan 2, 2026
cd80e56
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
melkap01-Arm Jan 6, 2026
6356e68
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
melkap01-Arm Jan 6, 2026
50dddaf
lintrunner issue fixed
melkap01-Arm Jan 6, 2026
017a425
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
melkap01-Arm Jan 6, 2026
2ad388c
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
melkap01-Arm Jan 7, 2026
8dc8bc3
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
melkap01-Arm Jan 8, 2026
e000f04
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
melkap01-Arm Jan 9, 2026
abfb0f3
Address feedback on testcases and other issues
JonathanC-ARM Jan 14, 2026
3bf22f2
Merge remote-tracking branch 'mel/main' into HEAD
JonathanC-ARM Jan 14, 2026
11c856c
Move dynamic quant tests to more appropriate file
JonathanC-ARM Jan 14, 2026
0c60f4e
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
JonathanC-ARM Jan 15, 2026
ce02e98
Removal of comment and debug print in test case
JonathanC-ARM Jan 15, 2026
94d8fe5
Merge branch 'microsoft:main' into melkap01_implement_mt_qgemm
JonathanC-ARM Jan 16, 2026
151 changes: 17 additions & 134 deletions onnxruntime/contrib_ops/cpu/quantization/dynamic_quantize_matmul.cc
@@ -164,132 +164,23 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase {
Status Compute(OpKernelContext* context) const override;

#if defined(USE_KLEIDIAI)
Status PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc,
/*out*/ bool& is_packed,
/*out*/ PrePackedWeights* prepacked_weights) override {
// only pack Matrix B
if (input_idx == GetBIdx()) {
const Tensor* b_zp_constant_tensor{nullptr};
bool b_quantization_might_be_asymmetric = false;

const OrtValue* b_zp;
if (Info().TryGetConstantInput(IN_B_ZERO_POINT, &b_zp)) {
b_zp_constant_tensor = &b_zp->Get<Tensor>();
}

// MlasDynamicQgemm requires symmetric quantization for B, so the B zero point value should either be all zeros
// or not provided.
if (b_zp_constant_tensor != nullptr) {
// B zero point is constant. Check if it is all zeros.
assert(b_zp_constant_tensor->IsDataType<uint8_t>() || b_zp_constant_tensor->IsDataType<int8_t>());
const auto* zp_bytes = static_cast<const std::byte*>(b_zp_constant_tensor->DataRaw());
const size_t zp_size_in_bytes = b_zp_constant_tensor->SizeInBytes();
b_quantization_might_be_asymmetric = std::any_of(zp_bytes, zp_bytes + zp_size_in_bytes,
[](std::byte v) { return v != std::byte{0}; });
} else {
// B zero point input is not constant. If it exists, we can't assume symmetric quantization.
const auto input_defs = Info().node().InputDefs();
const bool b_zp_input_exists = input_defs.size() > IN_B_ZERO_POINT && input_defs[IN_B_ZERO_POINT]->Exists();
b_quantization_might_be_asymmetric = b_zp_input_exists;
}

// MlasDynamicQgemm requires scale data to be available at packing stage
const Tensor* b_scale_tensor = nullptr;
const bool b_scale_available = Info().TryGetConstantInput(IN_B_SCALE, &b_scale_tensor);

can_use_dynamic_quant_mlas_ = (!b_quantization_might_be_asymmetric && b_scale_available);

// Kleidi dynamic path requires strictly positive, finite scales.
// Disable if any invalid scale is detected.
if (can_use_dynamic_quant_mlas_) {
const auto bs = b_scale_tensor->DataAsSpan<float>();
const bool has_invalid =
std::any_of(bs.begin(), bs.end(),
[](float s) { return !std::isfinite(s) || s <= 0.0f; });

if (has_invalid) {
can_use_dynamic_quant_mlas_ = false;
}
}

if (!MlasIsDynamicQGemmAvailable()) {
can_use_dynamic_quant_mlas_ = false;
}

// Only handle the common case of a 2D weight matrix. Additional matrices
// could be handled by stacking the packed buffers.
b_shape_ = tensor.Shape();
if (b_shape_.NumDimensions() >= 2) {
for (size_t i = 0; i < (b_shape_.NumDimensions() - 2); ++i) {
if (b_shape_[i] != 1) {
can_use_dynamic_quant_mlas_ = false;
break;
}
}
} else {
can_use_dynamic_quant_mlas_ = false;
}

// Can we use the mlas dynamic Q gemm interface supported with float output ?
if (!can_use_dynamic_quant_mlas_) {
// default to piece wise mlas interface with separate int matmul, quantize and float conversion
return MatMulIntegerToFloatBase::PrePack(tensor, input_idx, alloc, is_packed, prepacked_weights);
}
is_packed = false;

// Default to all zeros for bias
const Tensor* bias_tensor{nullptr};
const OrtValue* bias;
if (Info().TryGetConstantInput(IN_BIAS, &bias)) {
bias_tensor = &bias->Get<Tensor>();
dynamic_quant_mlas_bias_data_was_packed_ = true;
}
size_t K = static_cast<size_t>(b_shape_[0]);
size_t N = static_cast<size_t>(b_shape_[1]);

const auto* b_data = static_cast<const uint8_t*>(tensor.DataRaw());

std::optional<Tensor> b_trans_buffer;
if (IsBTransposed()) {
std::swap(K, N);
b_data = quantization::TransPoseInputData(b_data, b_trans_buffer, alloc, N, K);
}
bool SupportsKleidiaiDynamicQuant() const override {
if (!MlasIsDynamicQGemmAvailable()) {
return false;
}
return true;
}

const size_t packed_b_size = MlasDynamicQgemmPackBSize(N, K);
if (packed_b_size == 0) {
return Status::OK();
}
int GetBScaleIdx() const override {
return IN_B_SCALE;
}

packed_b_ = IAllocator::MakeUniquePtr<void>(alloc, packed_b_size, true);
// Initialize memory to 0 as there could be some padding associated with pre-packed
// buffer memory and we do not want it uninitialized and generate different hashes
// if and when we try to cache this pre-packed buffer for sharing between sessions.
memset(packed_b_.get(), 0, packed_b_size);

const auto scales = static_cast<size_t>(b_scale_tensor->Shape().Size()) == N ? std::vector<float>(&b_scale_tensor->Data<float>()[0],
&b_scale_tensor->Data<float>()[N])
:
// Broadcast matrix scale to all channels
std::vector<float>(N, b_scale_tensor->Data<float>()[0]);

const auto biases = bias_tensor != nullptr ? std::vector<float>(&bias_tensor->Data<float>()[0],
&bias_tensor->Data<float>()[N])
:
// Broadcast zero to all channels - no bias data is available
std::vector<float>(N, 0.f);

MlasDynamicQgemmPackB(N, K, reinterpret_cast<const int8_t*>(b_data), scales.data(), biases.data(),
packed_b_.get());

bool share_prepacked_weights = (prepacked_weights != nullptr);
if (share_prepacked_weights) {
prepacked_weights->buffers_.push_back(std::move(packed_b_));
prepacked_weights->buffer_sizes_.push_back(packed_b_size);
}
int GetBZeroPointIdx() const override {
return IN_B_ZERO_POINT;
}

is_packed = true;
}
return Status::OK();
int GetBiasIdx() const override {
return IN_BIAS;
}
#endif

@@ -303,14 +194,6 @@ class DynamicQuantizeMatMul final : public MatMulIntegerToFloatBase {

protected:
int GetBIdx() const override { return IN_B; }

private:
// Indicates when MlasDynamicQGemmBatch() can be used
bool can_use_dynamic_quant_mlas_{false};
#if defined(USE_KLEIDIAI)
// Indicates that the biases are a constant input and thus already quantized / packed
bool dynamic_quant_mlas_bias_data_was_packed_{false};
#endif
};

class MatMulIntegerToFloat final : public MatMulIntegerToFloatBase {
@@ -381,7 +264,7 @@ Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const {
}
}
// Guard against KleidiAI functions being called in non kleidi builds
// TODO: migrate to a suitable override function call for kleidi dynamic qgemm function calls
// migrate to a suitable override function call for kelidiai dynamic qgemm function calls
Copilot AI Jan 16, 2026

Corrected spelling of 'kelidiai' to 'KleidiAI'.

Suggested change
// migrate to a suitable override function call for kelidiai dynamic qgemm function calls
// migrate to a suitable override function call for KleidiAI dynamic qgemm function calls
#if defined(USE_KLEIDIAI)
else {
MatMulComputeHelper helper;
@@ -390,10 +273,10 @@ Status DynamicQuantizeMatMul::Compute(OpKernelContext* ctx) const {
// deleted during session init post prepacking
nullptr,
nullptr));

// allocate the kernel’s output tensor from the execution context
Tensor* y = ctx->Output(OUT_Y, helper.OutputShape());

// Bail out early if the output is going to be empty
// Bail out early if any dimension is 0; the product (and hence the total number of elements) is 0
if (y->Shape().Size() == 0)
return Status::OK();

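Taken together, the deleted PrePack body above reduces to a fixed pack-B pipeline around three MLAS entry points. Below is a minimal standalone sketch of that pipeline, not ONNX Runtime code: the helper name PackBForDynamicQgemm and the "mlas.h" include are assumptions for illustration, while MlasIsDynamicQGemmAvailable, MlasDynamicQgemmPackBSize and MlasDynamicQgemmPackB are the calls visible in this diff. B is assumed already transposed to K x N and symmetrically quantized (zero zero-point), as the checks above require.

// Minimal sketch (assumed helper, not ORT code) of the dynamic-qgemm pack-B flow.
#include <cstdint>
#include <vector>

#include "mlas.h"  // assumed location of the MlasDynamicQgemm* declarations

std::vector<uint8_t> PackBForDynamicQgemm(size_t N, size_t K, const int8_t* b_data,
                                          const std::vector<float>& b_scale,  // per-channel (N) or scalar (1)
                                          const float* bias) {                // nullptr => no bias
  if (!MlasIsDynamicQGemmAvailable())  // no SME/SME2 dynamic kernels in this CPU/build
    return {};

  const size_t packed_size = MlasDynamicQgemmPackBSize(N, K);
  if (packed_size == 0)
    return {};

  // Broadcast a scalar scale to all N channels and substitute zeros when no bias
  // is given, mirroring the deleted inline implementation above.
  const std::vector<float> scales =
      b_scale.size() == N ? b_scale : std::vector<float>(N, b_scale.at(0));
  const std::vector<float> biases =
      bias != nullptr ? std::vector<float>(bias, bias + N) : std::vector<float>(N, 0.f);

  // Value-initialized storage keeps any packing padding deterministic, which matters
  // when prepacked buffers are hashed and shared between sessions.
  std::vector<uint8_t> packed(packed_size);
  MlasDynamicQgemmPackB(N, K, b_data, scales.data(), biases.data(), packed.data());
  return packed;
}

With the refactor, these steps move into the shared base class, and the derived kernel only answers SupportsKleidiaiDynamicQuant() and reports which inputs hold the scale, zero point and bias via GetBScaleIdx(), GetBZeroPointIdx() and GetBiasIdx().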
1 change: 1 addition & 0 deletions onnxruntime/core/mlas/lib/kleidiai/mlasi_kleidiai.h
@@ -53,6 +53,7 @@ namespace ArmKleidiAI {

// By default we should try for SME2 first before falling back to SME.
inline const bool UseSME2 = MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME2();
inline const bool UseSME = MLAS_CPUIDINFO::GetCPUIDInfo().HasArm_SME();

// Buffer packing routines.
//
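The new UseSME flag mirrors the existing UseSME2 one: both cache the CPUID probe in an inline const at static-init time, so dispatch sites can prefer SME2 and fall back to SME without re-querying the CPU. A small illustrative sketch of such a dispatch follows; the enum and function are placeholders, not KleidiAI or MLAS symbols, and the include path is assumed from this diff.

// Illustrative dispatch sketch (placeholder names), assuming the internal header
// that defines ArmKleidiAI::UseSME2 / ArmKleidiAI::UseSME.
#include "mlasi_kleidiai.h"

enum class SmeVariant { kSme2, kSme, kNone };

inline SmeVariant PickSmeVariant() {
  if (ArmKleidiAI::UseSME2)
    return SmeVariant::kSme2;  // prefer SME2 when the core reports it
  if (ArmKleidiAI::UseSME)
    return SmeVariant::kSme;   // otherwise fall back to plain SME
  return SmeVariant::kNone;    // caller routes to the generic MLAS path
}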