Commit 2e96809

feat: support attention indexer on mlu for deepseek v3.2 prerequisite. (#311)
Co-authored-by: phantomlei <[email protected]>
1 parent 3028d49 commit 2e96809

15 files changed: +1023 -45 lines changed

cibuild/build_mlu.sh

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ function error() {
   exit 1
 }

-IMAGE="cambricon-base/pytorch:v25.06.0-torch2.7.1-torchmlu1.27.2-ubuntu22.04-py310_xllm251016"
+IMAGE="cambricon-base/pytorch:v25.06.0-torch2.7.1-torchmlu1.27.2-ubuntu22.04-py310_xllm251104"

 RUN_OPTS=(
   --rm

xllm/core/framework/model/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ set(BASE_DEPS
   :chat_template
   glog::glog
   torch
+  torch_python
 )

 if(USE_NPU)

xllm/core/kernels/mlu/attention.cpp

Lines changed: 38 additions & 2 deletions
@@ -18,9 +18,9 @@ limitations under the License.
 namespace xllm::kernel::mlu {

 void reshape_paged_cache(torch::Tensor& key,
-                         torch::Tensor& value,
+                         const std::optional<torch::Tensor>& value,
                          torch::Tensor& k_cache,
-                         torch::Tensor& v_cache,
+                         const std::optional<torch::Tensor>& v_cache,
                          const torch::Tensor& slot_mapping,
                          bool direction) {
   tmo::torch_api::reshape_paged_cache(
@@ -115,4 +115,40 @@ void batch_decode(const torch::Tensor& query,
       kv_cache_quant_bit_size);
 }

+void masked_indexer_select_paged_kv(const bool is_prefill,
+                                    const torch::Tensor& query,
+                                    const torch::Tensor& cu_seq_q_lens,
+                                    const torch::Tensor& cu_seq_k_lens,
+                                    const torch::Tensor& q_scale,
+                                    const torch::Tensor& weights,
+                                    const double softmax_scale,
+                                    const torch::Tensor& k_cache,
+                                    const torch::Tensor& k_context_lens,
+                                    const torch::Tensor& k_cache_block_table,
+                                    const torch::Tensor& k_scale_cache,
+                                    const int64_t index_topk,
+                                    const torch::Tensor& kv_cache_block_table,
+                                    const int64_t kv_cache_block_size,
+                                    const torch::Tensor& new_block_table,
+                                    const torch::Tensor& new_context_lens,
+                                    const int64_t quant_block_size) {
+  tmo::torch_api::masked_indexer_select_paged_kv(is_prefill,
+                                                 query,
+                                                 cu_seq_q_lens,
+                                                 cu_seq_k_lens,
+                                                 q_scale,
+                                                 weights,
+                                                 softmax_scale,
+                                                 k_cache,
+                                                 k_context_lens,
+                                                 k_cache_block_table,
+                                                 k_scale_cache,
+                                                 index_topk,
+                                                 kv_cache_block_table,
+                                                 kv_cache_block_size,
+                                                 new_block_table,
+                                                 new_context_lens,
+                                                 quant_block_size);
+}
+
 }  // namespace xllm::kernel::mlu
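
The first hunk relaxes reshape_paged_cache so that value and v_cache are optional. A minimal sketch of a call site that exercises this, assuming a keys-only cache such as the indexer's (the helper name and use case are illustrative, not taken from this commit):

#include <optional>

#include <torch/torch.h>

#include "mlu_ops_api.h"

// Hypothetical helper: with the relaxed signature, passing std::nullopt for
// the value-side arguments writes only keys into the paged cache, which is
// what a cache with no separate V (e.g. the indexer's k_cache) needs.
void cache_keys_only(torch::Tensor& key,
                     torch::Tensor& k_cache,
                     const torch::Tensor& slot_mapping) {
  xllm::kernel::mlu::reshape_paged_cache(key,
                                         /*value=*/std::nullopt,
                                         k_cache,
                                         /*v_cache=*/std::nullopt,
                                         slot_mapping,
                                         /*direction=*/false);
}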

xllm/core/kernels/mlu/fused_moe.cpp

Lines changed: 9 additions & 3 deletions
@@ -166,12 +166,15 @@ torch::Tensor fused_moe(
       /*b_scale=*/w1_scale.has_value() ? std::make_optional(w1_scale.value())
                                        : std::nullopt,
       /*bias=*/std::nullopt,
+      /*a_calibration=*/std::nullopt,
+      /*b_calibration=*/std::nullopt,
       /*quant_flag=*/w1_quant_flag.has_value() ? w1_quant_flag : std::nullopt,
       /*b_offset=*/std::nullopt,
       /*tile_config=*/std::nullopt,
       /*max_dim=*/tokens,
       /*trans_a=*/false,
-      /*trans_b=*/true);
+      /*trans_b=*/true,
+      /*a_quant_bit=*/is_smoothquant ? 8 : -1);

   // prepare the parameters for the second group gemm
   torch::Tensor act_out;
@@ -231,12 +234,15 @@ torch::Tensor fused_moe(
       w2_scale.has_value() ? std::make_optional(w2_scale.value())
                            : std::nullopt,  // b_scale
       /*bias=*/std::nullopt,
+      /*a_calibration=*/std::nullopt,
+      /*b_calibration=*/std::nullopt,
       w2_quant_flag.has_value() ? w2_quant_flag : std::nullopt,  // quant_flag
       /*b_offset=*/std::nullopt,
       /*tile_config=*/std::nullopt,
-      tokens,  // max_dim
+      /*max_dim=*/tokens,
       /*trans_a=*/false,
-      /*trans_b=*/true);
+      /*trans_b=*/true,
+      /*a_quant_bit=*/is_smoothquant ? 8 : -1);

   auto output = torch::empty({reduce_weight.size(0), gemm2_out.size(1)},
                              gemm2_out.options());
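
Both group-gemm calls gain two unused calibration slots and an explicit a_quant_bit argument. A one-liner capturing the convention the new argument encodes (the helper name is hypothetical):

// Hypothetical helper spelling out the a_quant_bit convention used above:
// 8 means activations are int8-quantized (smoothquant), -1 means they are
// left unquantized.
inline int64_t activation_quant_bit(bool is_smoothquant) {
  return is_smoothquant ? 8 : -1;
}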

xllm/core/kernels/mlu/mlu_ops_api.h

Lines changed: 20 additions & 2 deletions
@@ -52,9 +52,9 @@ void active(const torch::Tensor& input,
             int expert_size);

 void reshape_paged_cache(torch::Tensor& key,
-                         torch::Tensor& value,
+                         const std::optional<torch::Tensor>& value,
                          torch::Tensor& k_cache,
-                         torch::Tensor& v_cache,
+                         const std::optional<torch::Tensor>& v_cache,
                          const torch::Tensor& slot_mapping,
                          bool direction);

@@ -102,6 +102,24 @@ void batch_decode(const torch::Tensor& query,
                   bool return_lse,
                   int kv_cache_quant_bit_size);

+void masked_indexer_select_paged_kv(const bool is_prefill,
+                                    const torch::Tensor& query,
+                                    const torch::Tensor& cu_seq_q_lens,
+                                    const torch::Tensor& cu_seq_k_lens,
+                                    const torch::Tensor& q_scale,
+                                    const torch::Tensor& weights,
+                                    const double softmax_scale,
+                                    const torch::Tensor& k_cache,
+                                    const torch::Tensor& k_context_lens,
+                                    const torch::Tensor& k_cache_block_table,
+                                    const torch::Tensor& k_scale_cache,
+                                    const int64_t index_topk,
+                                    const torch::Tensor& kv_cache_block_table,
+                                    const int64_t kv_cache_block_size,
+                                    const torch::Tensor& new_block_table,
+                                    const torch::Tensor& new_context_lens,
+                                    const int64_t quant_block_size);
+
 void fused_layernorm(const torch::Tensor& input,
                      torch::Tensor& output,
                      const std::optional<torch::Tensor>& residual,

xllm/core/kernels/mlu/scaled_matmul.cpp

Lines changed: 10 additions & 12 deletions
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/

+#include <glog/logging.h>
+
 #include "mlu_ops_api.h"

 namespace xllm::kernel::mlu {
@@ -36,14 +38,12 @@ torch::Tensor scaled_matmul(
     const std::optional<torch::Tensor>& output /* = std::nullopt */
 ) {
   // Check: only support w8a8 quantization for now.
-  TORCH_CHECK(quant_bit_size == 8 && a_quant_bit_size == 8,
-              "scaled_matmul only supports w8a8 quantization (quant_bit_size "
-              "== 8, a_quant_bit_size == 8) for now. "
-              "Got quant_bit_size = ",
-              quant_bit_size,
-              ", a_quant_bit_size = ",
-              a_quant_bit_size,
-              ".");
+  CHECK(quant_bit_size == 8 && a_quant_bit_size == 8)
+      << "scaled_matmul only supports w8a8 quantization (quant_bit_size "
+         "== 8, a_quant_bit_size == 8) for now. "
+         "Got quant_bit_size = "
+      << quant_bit_size << ", a_quant_bit_size = " << a_quant_bit_size;

   // Only support smooth_quant algorithm for now
   std::string quant_algo = "smooth_quant";
@@ -63,10 +63,8 @@ torch::Tensor scaled_matmul(
   at::ScalarType torch_half = at::ScalarType::Half;
   at::ScalarType torch_bfloat16 = at::ScalarType::BFloat16;

-  TORCH_CHECK(output_dtype == torch_half || output_dtype == torch_bfloat16,
-              "output dtype must be half or bfloat16, but got: ",
-              output_dtype,
-              ".");
+  CHECK(output_dtype == torch_half || output_dtype == torch_bfloat16)
+      << "output dtype must be half or bfloat16, but got: " << output_dtype;

   // Select output tensor
   torch::Tensor output_tensor;
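
This file trades TORCH_CHECK for glog's CHECK. The observable difference, sketched below with made-up values: TORCH_CHECK throws a catchable c10::Error and builds its message from variadic arguments, while CHECK logs and aborts the process, streaming its message with operator<<.

#include <glog/logging.h>

#include <torch/torch.h>

// Illustration only: contrasts the two check styles swapped in this diff.
void check_styles(int64_t quant_bit_size) {
  // Old style: throws c10::Error on failure; message from variadic args.
  TORCH_CHECK(quant_bit_size == 8, "got quant_bit_size = ", quant_bit_size);
  // New style: aborts the process on failure; message via operator<<.
  CHECK(quant_bit_size == 8) << "got quant_bit_size = " << quant_bit_size;
}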

xllm/core/kernels/ops_api.cpp

Lines changed: 25 additions & 0 deletions
@@ -246,5 +246,30 @@ torch::Tensor random_sample(RandomSampleParams& params) {
   throw std::runtime_error("random_sample not implemented");
 #endif
 }
+
+void masked_indexer_select_paged_kv(MaskedIndexerSelectPagedKVParams& params) {
+#if defined(USE_MLU)
+  mlu::masked_indexer_select_paged_kv(params.is_prefill,
+                                      params.query,
+                                      params.cu_seq_q_lens,
+                                      params.cu_seq_k_lens,
+                                      params.q_scale,
+                                      params.weights,
+                                      params.softmax_scale,
+                                      params.k_cache,
+                                      params.k_context_lens,
+                                      params.k_cache_block_table,
+                                      params.k_scale_cache,
+                                      params.index_topk,
+                                      params.kv_cache_block_table,
+                                      params.kv_cache_block_size,
+                                      params.new_block_table,
+                                      params.new_context_lens,
+                                      params.quant_block_size);
+#else
+  throw std::runtime_error("masked_indexer_select_paged_kv not implemented");
+#endif
+}
+
 }  // namespace kernel
 }  // namespace xllm
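
The new dispatcher follows the pattern used throughout this file: compile-time backend selection, with a runtime error on builds that lack the kernel. A stripped-down sketch of that pattern (the op and params names are placeholders):

#include <stdexcept>

struct SomeOpParams {};  // placeholder for a real params struct

void some_op(SomeOpParams& params) {
#if defined(USE_MLU)
  // Unpack params field by field and forward to the MLU kernel wrapper.
  (void)params;
#else
  (void)params;
  throw std::runtime_error("some_op not implemented");
#endif
}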

xllm/core/kernels/ops_api.h

Lines changed: 2 additions & 0 deletions
@@ -49,5 +49,7 @@ torch::Tensor apply_top_k_top_p(TopKPParams& params);

 torch::Tensor random_sample(RandomSampleParams& params);

+void masked_indexer_select_paged_kv(MaskedIndexerSelectPagedKVParams& params);
+
 }  // namespace kernel
 }  // namespace xllm

xllm/core/kernels/param.h

Lines changed: 24 additions & 2 deletions
@@ -56,9 +56,9 @@ struct ActivationParams {
 // Reshape paged cache parameters
 struct ReshapePagedCacheParams {
   torch::Tensor key;
-  torch::Tensor value;
+  std::optional<torch::Tensor> value;
   torch::Tensor k_cache;
-  torch::Tensor v_cache;
+  std::optional<torch::Tensor> v_cache;
   torch::Tensor slot_mapping;
   bool direction = false;
 };
@@ -220,5 +220,27 @@ struct TopKPParams {
 struct RandomSampleParams {
   torch::Tensor logits;
 };
+
+// Masked indexer select paged kv parameters
+struct MaskedIndexerSelectPagedKVParams {
+  bool is_prefill;
+  torch::Tensor query;
+  torch::Tensor cu_seq_q_lens;
+  torch::Tensor cu_seq_k_lens;
+  torch::Tensor q_scale;
+  torch::Tensor weights;
+  double softmax_scale;
+  torch::Tensor k_cache;
+  torch::Tensor k_context_lens;
+  torch::Tensor k_cache_block_table;
+  torch::Tensor k_scale_cache;
+  int64_t index_topk;
+  torch::Tensor kv_cache_block_table;
+  int64_t kv_cache_block_size;
+  torch::Tensor new_block_table;
+  torch::Tensor new_context_lens;
+  int64_t quant_block_size;
+};
+
 }  // namespace kernel
 }  // namespace xllm
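
A hedged sketch of how a caller might fill the new struct and go through the dispatcher. Every scalar below is an invented placeholder and most tensors are left to the caller, so read this as an illustration of the wiring, not a working configuration (the include paths are also assumptions):

#include <cmath>

#include <torch/torch.h>

#include "kernels/ops_api.h"  // assumed include paths
#include "kernels/param.h"

void select_indexer_topk(const torch::Tensor& query,
                         const torch::Tensor& k_cache) {
  xllm::kernel::MaskedIndexerSelectPagedKVParams params;
  params.is_prefill = true;
  params.query = query;
  params.k_cache = k_cache;
  params.softmax_scale = 1.0 / std::sqrt(128.0);  // head_dim = 128 assumed
  params.index_topk = 2048;                       // invented value
  params.kv_cache_block_size = 16;                // invented value
  params.quant_block_size = 128;                  // invented value
  // cu_seq_q_lens, cu_seq_k_lens, q_scale, weights, k_context_lens,
  // k_cache_block_table, k_scale_cache, kv_cache_block_table,
  // new_block_table, and new_context_lens would be filled analogously.
  xllm::kernel::masked_indexer_select_paged_kv(params);
}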

xllm/core/layers/common/CMakeLists.txt

Lines changed: 19 additions & 0 deletions
@@ -17,6 +17,7 @@ cc_library(
     linear_impl.h
     word_embedding_impl.h
     layer_utils.h
+    indexer.h
   SRCS
     qwen3_attention.cpp
     attention.cpp
@@ -28,6 +29,7 @@ cc_library(
     qwen3_moe_decoder_layer.cpp
     linear_impl.cpp
     layer_utils.cpp
+    indexer.cpp
   DEPS
     "-Wl,--whole-archive"
     "-Wl,--no-whole-archive"
@@ -76,3 +78,20 @@ cc_test(
     torch
     GTest::gtest_main
 )
+
+# Add test for Indexer
+cc_test(
+  NAME
+    indexer_test
+  SRCS
+    tests/indexer_tests.cpp
+    tests/tests_utils.cpp
+  DEPS
+    :common_layers
+    :parallel_state
+    :model
+    :state_dict
+    glog::glog
+    torch
+    GTest::gtest_main
+)
