Implement new experimental lookup-based matrix multiplication method (TMAC) #26695
Changes from 38 commits
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -15,7 +15,10 @@ | |
| #include "core/mlas/inc/mlas_q4.h" | ||
| #include "core/providers/cpu/math/matmul_helper.h" | ||
| #include "core/providers/common.h" | ||
| #include "core/session/onnxruntime_session_options_config_keys.h" | ||
| #include "contrib_ops/cpu/quantization/matmul_nbits_helper.h" | ||
| #include "core/platform/threadpool.h" | ||
| #include "core/util/thread_utils.h" | ||
|
|
||
| namespace onnxruntime { | ||
| namespace contrib { | ||
|
|
@@ -39,12 +42,14 @@ | |
| Level2, /*!< input fp16, accumulator fp16 */ | ||
| Level3, /*!< input bf16, accumulator fp32 */ | ||
| Level4, /*!< input int8, accumulator int32 */ | ||
| Level5, /*!< input uint8, use the T-MAC LUT-based GEMM approach */ | ||
| } ACCURACY_LEVEL; | ||
|
|
||
| // T: A data type. | ||
| template <typename T> | ||
| MLAS_QNBIT_GEMM_COMPUTE_TYPE | ||
| GetComputeType(size_t nbits, size_t block_size, int64_t accuracy_level_attr) { | ||
|
|
||
| // For Fp32, only accuracy level 1 or 4 makes sense. | ||
| // non-ARM CPU converts Fp16 to Fp32. | ||
| // By converting Fp32 to Fp16, precision becomes worse. And due to the casting, | ||
|
|
@@ -54,6 +59,7 @@ | |
| return SQNBIT_CompInt8; | ||
| } | ||
|
|
||
|
|
||
| return SQNBIT_CompFp32; | ||
| } | ||
|
|
||
|
|
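For readers skimming the diff, here is a minimal sketch of the mapping implied by the ACCURACY_LEVEL enum and the comments above; it ignores the nbits/block_size availability checks that the real `GetComputeType()` also performs, and the helper name is illustrative only:

```cpp
// Minimal sketch, not the actual implementation: for fp32 inputs, only
// accuracy levels 1 (fp32) and 4 (int8) are meaningful, so level 4 selects
// int8 compute and everything else falls back to fp32 compute.
// MLAS_QNBIT_GEMM_COMPUTE_TYPE, SQNBIT_CompFp32 and SQNBIT_CompInt8 come
// from the MLAS headers already included in this file.
inline MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeTypeForFp32Sketch(int64_t accuracy_level) {
  if (accuracy_level == 4) {  // Level4: int8 input, int32 accumulator
    return SQNBIT_CompInt8;
  }
  return SQNBIT_CompFp32;     // other levels fall back to fp32 compute
}
```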
@@ -100,6 +106,7 @@ | |
| nbits_{narrow<size_t>(info.GetAttr<int64_t>("bits"))}, | ||
| has_g_idx_{info.GetInputCount() > InputIndex::g_idx && info.node().InputDefs()[InputIndex::g_idx]->Exists()}, | ||
| has_bias_{info.GetInputCount() > InputIndex::bias && info.node().InputDefs()[InputIndex::bias]->Exists()}, | ||
| prefer_lut_gemm_{info.GetConfigOptions().GetConfigEntry(kOrtSessionOptionsMlasLUTGemm) == "1"}, | ||
| compute_type_{GetComputeType<T1>(nbits_, block_size_, info.GetAttr<int64_t>("accuracy_level"))} { | ||
| const auto& node = info.node(); | ||
| auto input_defs = node.InputDefs(); | ||
|
|
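For context on how this option reaches users: the constructor above only sets prefer_lut_gemm_ when the session config entry kOrtSessionOptionsMlasLUTGemm is "1". Below is a minimal sketch of opting in from application code; it assumes the new key constant is exported through the public onnxruntime_session_options_config_keys.h header as this diff suggests (the key's string value is not shown here), and the model path is a placeholder:

```cpp
// Sketch only: enabling the experimental MLAS LUT (T-MAC) GEMM path via the
// session option read in the MatMulNBits constructor above.
#include <onnxruntime_cxx_api.h>
#include <onnxruntime_session_options_config_keys.h>  // assumed to define kOrtSessionOptionsMlasLUTGemm

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "matmulnbits_lut");
  Ort::SessionOptions so;
  so.AddConfigEntry(kOrtSessionOptionsMlasLUTGemm, "1");  // "1" turns on prefer_lut_gemm_
  Ort::Session session(env, ORT_TSTR("model_with_matmulnbits.onnx"), so);  // placeholder model
  return 0;
}
```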
@@ -116,6 +123,8 @@ | |
| "Only 2b, 4b and 8b quantization is supported for MatMulNBits op, additional bits support is planned."); | ||
| const Tensor* tensor_zero_point = nullptr; | ||
| has_zp_input_ = info.TryGetConstantInput(InputIndex::zero_points, &tensor_zero_point); | ||
| prefer_lut_gemm_ = prefer_lut_gemm_ && MlasIsLUTGemmAvailable(N_, K_, nbits_, block_size_); | ||
| } | ||
|
|
||
| Status Compute(OpKernelContext* context) const override; | ||
|
|
@@ -135,11 +144,14 @@ | |
| const bool has_g_idx_; | ||
| const bool has_bias_; | ||
| bool scales_are_packed_{false}; | ||
| bool prefer_lut_gemm_{false}; | ||
| const MLAS_QNBIT_GEMM_COMPUTE_TYPE compute_type_; | ||
| bool has_unquantized_zero_point_{false}; | ||
| const bool column_wise_quant_{true}; | ||
| IAllocatorUniquePtr<void> packed_b_{}; | ||
| size_t packed_b_size_{0}; | ||
| IAllocatorUniquePtr<float> packed_scales_zp_{}; | ||
| size_t packed_scales_zp_size_{0}; | ||
| IAllocatorUniquePtr<float> scales_fp32_{}; | ||
| IAllocatorUniquePtr<float> bias_fp32_{}; | ||
|
|
||
|
|
@@ -167,6 +179,15 @@ | |
| AllocatorPtr& allocator, | ||
| concurrency::ThreadPool* thread_pool, | ||
| const MatMulComputeHelper& helper) const; | ||
|
|
||
| Status ComputeBPackedLUT(const Tensor* a, | ||
| const Tensor* scales, | ||
| const Tensor* zero_points, | ||
| const Tensor* bias, | ||
| Tensor* y, | ||
| AllocatorPtr& allocator, | ||
| concurrency::ThreadPool* thread_pool, | ||
| const MatMulComputeHelper& helper) const; | ||
| }; | ||
|
|
||
| template <typename T1> | ||
|
|
@@ -175,26 +196,62 @@ | |
| /*out*/ PrePackedWeights* prepacked_weights) { | ||
| ORT_UNUSED_PARAMETER(prepacked_weights); | ||
| is_packed = false; | ||
| if (has_g_idx_ || has_unquantized_zero_point_) { | ||
| // if (has_g_idx_ || has_unquantized_zero_point_) | ||
| // TODO: temporarily relaxed to only check has_g_idx_ so the LUT MatMulNBits path can be tested; restore the has_unquantized_zero_point_ check before finalizing. | ||
|
| if (has_g_idx_) { | ||
| return Status::OK(); | ||
| } | ||
|
|
||
| if (!MlasIsQNBitGemmAvailable(nbits_, block_size_, compute_type_)) { | ||
| if (!MlasIsQNBitGemmAvailable(nbits_, block_size_, compute_type_) && !prefer_lut_gemm_) { | ||
| return Status::OK(); | ||
| } | ||
|
|
||
| // Create a temporary threadpool for parallel packing | ||
| // This is used during model load time to speed up weight prepacking | ||
|
Contributor: What is the overhead like for creating a new threadpool in each call to PrePack()? I wonder if we should make an existing threadpool available to this code; perhaps we can pass in the threadpool from SessionState. Something to consider, and maybe for a future PR.

Contributor (Author): I agree, passing the thread pool to PrePack() would be better; something to consider in a future PR.
||
| std::unique_ptr<concurrency::ThreadPool> temp_threadpool; | ||
|
| concurrency::ThreadPool* threadpool_ptr = nullptr; | ||
|
|
||
| // Only create threadpool for operations that can benefit from it | ||
| if (prefer_lut_gemm_ || compute_type_ == SQNBIT_CompInt8) { | ||
| OrtThreadPoolParams tpo; | ||
| tpo.thread_pool_size = 4; // use a small fixed-size pool for parallel prepacking | ||
| tpo.allow_spinning = false; // Don't spin during model load | ||
| tpo.auto_set_affinity = false; | ||
|
|
||
| temp_threadpool = concurrency::CreateThreadPool( | ||
| &Env::Default(), | ||
| tpo, | ||
| concurrency::ThreadPoolType::INTRA_OP); | ||
|
|
||
| threadpool_ptr = temp_threadpool.get(); | ||
| } | ||
|
|
||
| if (input_idx == InputIndex::B) { | ||
|
|
||
| const Tensor* scales = nullptr; | ||
| OpKernel::Info().TryGetConstantInput(InputIndex::scales, &scales); | ||
|
|
||
| packed_b_size_ = MlasQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_, has_zp_input_, compute_type_); | ||
| if (packed_b_size_ == 0) { | ||
| return Status::OK(); | ||
| if (prefer_lut_gemm_) { | ||
| MlasInitLUTGemmKernelConfig(N_, K_, nbits_, block_size_, has_zp_input_); | ||
| packed_b_size_ = MlasLUTGemmPackQuantBDataSize(N_, K_, nbits_, block_size_, has_zp_input_); | ||
| if (packed_b_size_ == 0) { | ||
| return Status::OK(); | ||
| } | ||
| auto qptr = tensor.DataRaw(); | ||
| packed_b_ = IAllocator::MakeUniquePtr<void>(alloc, packed_b_size_, true); | ||
| MlasLUTGemmPackQuantBData(N_, K_, nbits_, block_size_, static_cast<const std::byte*>(qptr), static_cast<std::byte*>(packed_b_.get()), threadpool_ptr); | ||
| } else { | ||
| packed_b_size_ = MlasQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_, has_zp_input_, compute_type_); | ||
| if (packed_b_size_ == 0) { | ||
| return Status::OK(); | ||
| } | ||
| auto qptr = tensor.DataRaw(); | ||
| auto scale_ptr = scales ? scales->DataRaw() : nullptr; | ||
| packed_b_ = IAllocator::MakeUniquePtr<void>(alloc, packed_b_size_, true); | ||
| MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(), scale_ptr, | ||
| has_zp_input_, nullptr, threadpool_ptr); | ||
|
Member: IIUC, the usage of a threadpool in the existing non-LUT path seems like a new addition; is that intentional (and does it come with appropriate tests)?

Contributor (Author): Initially, I thought tests in [...]. Once we add tests, I think it might be beneficial to use the thread pool for prepacking on the other paths as well.

Contributor: Closing this comment for now to merge, as discussed offline.
||
|
|
||
| } | ||
| auto qptr = tensor.DataRaw(); | ||
| auto scale_ptr = scales ? scales->DataRaw() : nullptr; | ||
| packed_b_ = IAllocator::MakeUniquePtr<void>(alloc, packed_b_size_, true); | ||
| MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(), scale_ptr, | ||
| has_zp_input_, nullptr, nullptr); | ||
| is_packed = true; | ||
| } else if (compute_type_ == SQNBIT_CompInt8) { | ||
| // Packing scales and zero points | ||
|
|
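For clarity, the LUT weight-prepacking sequence used in PrePack() above can be read as the standalone sketch below. The Mlas*LUT* calls are the new APIs introduced by this PR, with signatures taken from this diff; the wrapper function itself is illustrative only:

```cpp
// Sketch of the LUT (T-MAC) weight-prepacking sequence, isolated from PrePack().
#include <cstddef>
#include <vector>

std::vector<std::byte> PackQuantBForLut(size_t N, size_t K, size_t nbits, size_t block_size,
                                        bool has_zp, const std::byte* quant_b_data,
                                        onnxruntime::concurrency::ThreadPool* tp) {
  std::vector<std::byte> packed;
  // 1) Configure the LUT kernel for this weight shape.
  MlasInitLUTGemmKernelConfig(N, K, nbits, block_size, has_zp);
  // 2) Query the packed buffer size; 0 means the shape is not supported by the LUT path.
  const size_t packed_size = MlasLUTGemmPackQuantBDataSize(N, K, nbits, block_size, has_zp);
  if (packed_size == 0) return packed;
  packed.resize(packed_size);
  // 3) Repack the raw quantized weights into the LUT-friendly layout (optionally in parallel).
  MlasLUTGemmPackQuantBData(N, K, nbits, block_size, quant_b_data, packed.data(), tp);
  return packed;
}
```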
@@ -230,8 +287,26 @@ | |
| is_packed = true; | ||
| } | ||
| #endif // MLAS_TARGET_ARM64 | ||
| } else if (prefer_lut_gemm_) { | ||
| if (input_idx == InputIndex::scales && packed_b_ != nullptr) { | ||
| auto scales_ptr = tensor.Data<float>(); | ||
| packed_scales_zp_size_ = MlasLUTPackScalesAndZeroPointsSize(N_, K_, block_size_, has_zp_input_); | ||
| packed_scales_zp_ = IAllocator::MakeUniquePtr<float>(alloc, packed_scales_zp_size_, true); | ||
|
|
||
| // TODO(vraspar): improve this logic block | ||
| if (has_zp_input_) { | ||
| const Tensor* zero_points = nullptr; | ||
| OpKernel::Info().TryGetConstantInput(InputIndex::zero_points, &zero_points); | ||
| auto zero_points_ptr = zero_points->Data<uint8_t>(); | ||
| MlasLUTPackScalesAndZeroPoints(N_, K_, nbits_, block_size_, has_zp_input_, packed_scales_zp_.get(), scales_ptr, zero_points_ptr); | ||
| } else { | ||
| MlasLUTPackScalesAndZeroPoints(N_, K_, nbits_, block_size_, has_zp_input_, packed_scales_zp_.get(), scales_ptr, nullptr); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Threadpool will be automatically destroyed when temp_threadpool goes out of scope | ||
|
|
||
| return Status::OK(); | ||
| } | ||
|
|
||
|
|
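The scales / zero-point packing in the hunk above reduces to the following sketch, which also folds the TODO into a single call site: zero_points is simply nullptr when the node has no zero-point input. Signatures follow the calls in this diff; the helper is illustrative only:

```cpp
// Sketch of the LUT scales / zero-point packing step.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<float> PackScalesZpForLut(size_t N, size_t K, size_t nbits, size_t block_size,
                                      const float* scales,
                                      const uint8_t* zero_points /* may be nullptr */) {
  const bool has_zp = (zero_points != nullptr);
  std::vector<float> packed(MlasLUTPackScalesAndZeroPointsSize(N, K, block_size, has_zp));
  MlasLUTPackScalesAndZeroPoints(N, K, nbits, block_size, has_zp,
                                 packed.data(), scales, zero_points);
  return packed;
}
```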
@@ -296,7 +371,7 @@ | |
| is_packed = false; | ||
| } | ||
| #endif // MLAS_TARGET_AMD64_IX86 | ||
| } | ||
| } | ||
|
|
||
| return Status::OK(); | ||
| } | ||
|
|
@@ -307,14 +382,38 @@ | |
| /*out*/ bool& used_shared_buffers) { | ||
| used_shared_buffers = false; | ||
|
|
||
| if (input_idx == 1) { | ||
| if (input_idx == 1) { // TODO(vraspar): do we need a shared prepacked buffer for T-MAC? Consider combining packing of weights and scales/zero points into one buffer. | ||
| used_shared_buffers = true; | ||
| packed_b_ = std::move(prepacked_buffers[0]); | ||
| } | ||
|
|
||
| return Status::OK(); | ||
| } | ||
|
|
||
| template <typename T1> | ||
| Status MatMulNBits<T1>::ComputeBPackedLUT(const Tensor* a, | ||
| const Tensor* scales, | ||
| const Tensor* zero_points, | ||
| const Tensor* bias, | ||
| Tensor* y, | ||
| AllocatorPtr& allocator, | ||
| concurrency::ThreadPool* thread_pool, | ||
| const MatMulComputeHelper& helper) const { | ||
| const auto* a_data = a->Data<T1>(); | ||
| const auto* scales_data = scales == nullptr ? nullptr : scales->Data<T1>(); | ||
| const auto* zero_points_data = zero_points == nullptr ? nullptr : zero_points->DataRaw(); | ||
| const auto* bias_data = bias == nullptr ? nullptr : bias->Data<T1>(); | ||
| auto* y_data = y->MutableData<T1>(); | ||
| const size_t batch_count = helper.OutputOffsets().size(); | ||
| const size_t M = static_cast<size_t>(helper.M()); | ||
| const size_t N = static_cast<size_t>(helper.N()); | ||
| const size_t K = static_cast<size_t>(helper.K()); | ||
| // TODO(vraspar): should the GEMM be batched over batch_count here? | ||
| // MlasInitLUTGemmKernelConfig(N, K, nbits_, block_size_, has_zp_input_);  // already called during PrePack | ||
| MlasLUTGemm(a_data, block_size_, packed_b_.get(), packed_scales_zp_.get(), y_data, K, M, N, thread_pool); | ||
| return Status::OK(); | ||
| } | ||
|
|
||
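Regarding the "should the GEMM be batched" TODO in ComputeBPackedLUT: one possible shape for a batched loop is sketched below. This is an illustration only, not part of the PR; it assumes MatMulComputeHelper's LeftOffsets()/OutputOffsets() describe contiguous per-batch slices of A and Y, as in the existing ComputeBPacked path, and it reuses the local names from ComputeBPackedLUT above.

```cpp
// Illustration only: issue one MlasLUTGemm per batch entry, offsetting A and Y
// by the per-batch offsets reported by MatMulComputeHelper.
for (size_t b = 0; b < batch_count; ++b) {
  const T1* a_batch = a_data + helper.LeftOffsets()[b];
  T1* y_batch = y_data + helper.OutputOffsets()[b];
  MlasLUTGemm(a_batch, block_size_, packed_b_.get(), packed_scales_zp_.get(),
              y_batch, K, M, N, thread_pool);
}
```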
| template <typename T1> | ||
| Status MatMulNBits<T1>::ComputeBPacked(const Tensor* a, | ||
| const Tensor* scales, | ||
|
|
@@ -334,6 +433,7 @@ | |
| const size_t M = static_cast<size_t>(helper.M()); | ||
| const size_t N = static_cast<size_t>(helper.N()); | ||
| const size_t K = static_cast<size_t>(helper.K()); | ||
|
|
||
| const size_t lda = helper.Lda(false); | ||
|
|
||
| IAllocatorUniquePtr<std::byte> workspace{}; | ||
|
|
@@ -774,6 +874,10 @@ | |
| // If this changes, i.e., if MlasIsQNBitGemmAvailable() can return true while | ||
| // MlasQNBitGemmPackQuantBDataSize() returns 0, we can consider calling MlasQNBitGemmBatch() | ||
| // with B directly too. | ||
| if (prefer_lut_gemm_) { | ||
| return ComputeBPackedLUT(a, scales, zero_points, bias, y, allocator, thread_pool, helper); | ||
| } | ||
|
|
||
| if (MlasIsQNBitGemmAvailable(nbits_, block_size_, compute_type_)) { | ||
| return ComputeBPacked(a, scales, zero_points, bias, y, allocator, thread_pool, helper); | ||
| } | ||
|
|
||