Commit eae76db

init moe interface
1 parent 5036ca3 commit eae76db

File tree: 6 files changed, +220 -11 lines

include/sgl_kernel_ops.h

Lines changed: 1 addition & 1 deletion

@@ -255,7 +255,7 @@ void fp8_blockwise_scaled_grouped_mm(
     const torch::Tensor& expert_offsets,
     const torch::Tensor& workspace);

-void moe_grouped_mm_nt(
+void moe_grouped_mm_nn(
     torch::Tensor& output,
     const torch::Tensor& activations,
     const torch::Tensor& weights,

python/sgl_kernel/__init__.py

Lines changed: 2 additions & 1 deletion

@@ -51,9 +51,10 @@
     apply_shuffle_mul_sum,
     cutlass_fp4_group_mm,
     fp8_blockwise_scaled_grouped_mm,
+    fused_experts,
     moe_align_block_size,
+    moe_align_block_size_impl,
     moe_fused_gate,
-    moe_grouped_mm_nt,
     moe_sum,
     moe_sum_reduce,
     prepare_moe_input,

python/sgl_kernel/moe.py

Lines changed: 208 additions & 3 deletions

@@ -3,7 +3,7 @@
 import torch


-def moe_align_block_size(
+def moe_align_block_size_impl(
     topk_ids,
     num_experts,
     block_size,
@@ -25,6 +25,43 @@ def moe_align_block_size(
     )


+def moe_align_block_size(
+    topk_ids,
+    num_experts,
+    block_size,
+    pad_sorted_token_ids=False,
+):
+    max_num_tokens_padded = topk_ids.numel() + (num_experts + 1) * (block_size - 1)
+
+    sorted_ids_xpu = torch.empty(
+        (max_num_tokens_padded,), dtype=torch.int32, device=topk_ids.device
+    )
+    if not pad_sorted_token_ids:
+        sorted_ids_xpu.fill_(topk_ids.numel())
+    max_num_m_blocks = max_num_tokens_padded // block_size
+    expert_ids_xpu = torch.zeros(
+        (max_num_m_blocks,), dtype=torch.int32, device=topk_ids.device
+    )
+    num_tokens_post_pad_xpu = torch.empty(
+        (1,), dtype=torch.int32, device=topk_ids.device
+    )
+    cumsum_buffer = torch.empty(
+        num_experts + 2, dtype=torch.int32, device=topk_ids.device
+    )
+    moe_align_block_size_impl(
+        topk_ids,
+        num_experts + 1,
+        block_size,
+        sorted_ids_xpu,
+        expert_ids_xpu,
+        num_tokens_post_pad_xpu,
+        cumsum_buffer,
+        pad_sorted_token_ids,
+    )
+
+    return sorted_ids_xpu, expert_ids_xpu, num_tokens_post_pad_xpu
+
+
 def topk_softmax(
     topk_weights: torch.Tensor,
     topk_ids: torch.Tensor,
@@ -219,7 +256,7 @@ def cutlass_fp4_group_mm(
     return c.to(dtype=out_dtype)


-def moe_grouped_mm_nt(activations, weights, total_rows_for_experts, n_experts):
+def moe_grouped_mm_nn(activations, weights, total_rows_for_experts, n_experts):
     """
     BF16/FP16 grouped GEMM for MoE with non-transposed weights.
     activations: (total_tokens, hidden_dim)
@@ -233,7 +270,175 @@ def moe_grouped_mm_nt(activations, weights, total_rows_for_experts, n_experts):
         device=activations.device,
         dtype=activations.dtype,
     )
-    torch.ops.sgl_kernel.moe_grouped_mm_nt(
+    torch.ops.sgl_kernel.moe_grouped_mm_nn(
         output, activations, weights, total_rows_for_experts, n_experts
     )
     return output
+
+
+def fused_experts(
+    hidden_states: torch.Tensor,
+    w1: torch.Tensor,
+    w2: torch.Tensor,
+    topk_weights: torch.Tensor,
+    topk_ids: torch.Tensor,
+    b1: Optional[torch.Tensor] = None,
+    b2: Optional[torch.Tensor] = None,
+    inplace: bool = False,
+    activation: str = "silu",
+    use_fp8_w8a8: bool = False,
+    w1_scale: Optional[torch.Tensor] = None,
+    w2_scale: Optional[torch.Tensor] = None,
+    w1_zp: Optional[torch.Tensor] = None,
+    w2_zp: Optional[torch.Tensor] = None,
+    a1_scale: Optional[torch.Tensor] = None,
+    a2_scale: Optional[torch.Tensor] = None,
+    block_shape: Optional[list[int]] = None,
+    no_combine: bool = False,
+    routed_scaling_factor: Optional[float] = None,
+    gemm1_alpha: Optional[float] = None,
+    gemm1_limit: Optional[float] = None,
+) -> torch.Tensor:
+    """
+    Compute a Mixture of Experts (MoE) layer using two sets of expert
+    weights, w1 and w2, and a top-k gating mechanism.
+
+    Parameters:
+    - hidden_states [num_tokens, hidden_dim] (torch.Tensor): The input tensor to the MoE layer.
+    - w1 [num_experts, hidden_dim, 2 * intermediate_dim] (torch.Tensor): The fused gate/up expert weights.
+    - w2 [num_experts, intermediate_dim, hidden_dim] (torch.Tensor): The down-projection expert weights.
+    - topk_weights [num_tokens, topk] (torch.Tensor): The routing weights of the selected experts.
+    - topk_ids [num_tokens, topk] (torch.Tensor): The indices of the selected experts.
+    - b1 (Optional[torch.Tensor]): Optional bias for w1.
+    - b2 (Optional[torch.Tensor]): Optional bias for w2.
+    - inplace (bool): If True, perform operations in-place to save memory. Defaults to False.
+    - activation (str): The activation function to use ('silu' or 'gelu'). Defaults to 'silu'.
+    - use_fp8_w8a8 (bool): If True, use fp8 arithmetic to compute the inner
+      products for w1 and w2. Defaults to False.
+    - w1_scale (Optional[torch.Tensor]): Optional scale to be used for w1.
+    - w2_scale (Optional[torch.Tensor]): Optional scale to be used for w2.
+    - a1_scale (Optional[torch.Tensor]): Optional scale to be used for a1.
+    - a2_scale (Optional[torch.Tensor]): Optional scale to be used for a2.
+    - block_shape (Optional[List[int]]): Optional block size for block-wise
+      quantization.
+    - no_combine (bool): If True, skip the combine step. Defaults to False.
+    - routed_scaling_factor (Optional[float]): Optional scaling factor for routed tokens, used by Llama4 only.
+    - gemm1_alpha (Optional[float]): Optional gemm1_alpha for the activation
+      function.
+    - gemm1_limit (Optional[float]): Optional gemm1_limit for the swiglu activation
+      function.
+
+    Returns:
+    - torch.Tensor: The output tensor after applying the MoE layer.
+    """
+
+    assert use_fp8_w8a8 is False, "current MoE does not support use_fp8_w8a8"
+    assert w1_scale is None, "current MoE does not support w1_scale"
+    assert w2_scale is None, "current MoE does not support w2_scale"
+    assert a1_scale is None, "current MoE does not support a1_scale"
+    assert a2_scale is None, "current MoE does not support a2_scale"
+    assert block_shape is None, "current MoE does not support block_shape"
+
+    # type check
+    assert hidden_states.dtype == torch.bfloat16, "hidden_states must be bfloat16"
+    assert w1.dtype == torch.bfloat16, "w1 must be bfloat16"
+    assert w2.dtype == torch.bfloat16, "w2 must be bfloat16"
+
+    # Shape check
+    assert hidden_states.ndim == 2, "hidden_states must be 2D"
+    assert (
+        hidden_states.shape[-1] == w1.shape[-2]
+    ), f"hidden_states shape[-1] {hidden_states.shape} must be equal to w1 shape[-2] {w1.shape}"
+    assert (
+        2 * w2.shape[1] == w1.shape[2]
+    ), f"w2 shape[1] {w2.shape[1]} must be half of w1 shape[2] {w1.shape[2]}"
+    assert (topk_ids.shape == topk_weights.shape) and (
+        topk_ids.shape[0] == hidden_states.shape[0]
+    ), f"topk_ids shape {topk_ids.shape} and topk_weights shape {topk_weights.shape} must be equal and match hidden_states shape[0] {hidden_states.shape[0]}"
+
+    num_tokens, _ = hidden_states.shape
+    E, K, _ = w1.shape
+    _, N, OutK = w2.shape
+
+    M = num_tokens
+    TopK = topk_ids.shape[1]
+
+    # One flat buffer backs both grouped-GEMM outputs; cache1 is no longer
+    # needed once the activation has been applied, so cache3 reuses its storage.
+    cache = torch.empty(
+        M * TopK * max(2 * N, OutK),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+    intermediate_cache1 = cache[: M * TopK * 2 * N].view(M * TopK, 2 * N)
+    intermediate_cache2 = torch.empty(
+        (M * TopK, N),
+        device=hidden_states.device,
+        dtype=hidden_states.dtype,
+    )
+    intermediate_cache3 = cache[: M * TopK * OutK].view(M * TopK, OutK)
+
+    if no_combine:
+        assert not inplace
+        out_hidden_states = torch.empty(
+            (num_tokens, OutK),
+            device=hidden_states.device,
+            dtype=hidden_states.dtype,
+        )
+    elif inplace:
+        out_hidden_states = hidden_states
+    else:
+        out_hidden_states = torch.zeros_like(hidden_states)
+
+    # Sort the flattened (token, expert) assignments by expert id so that each
+    # expert's rows are contiguous for the grouped GEMM.
+    idxs = topk_ids.view(-1).argsort()
+    counts = topk_ids.view(-1).to(torch.long).bincount(minlength=E).cpu().numpy()
+    tokens_per_expert = counts.cumsum()
+    num_per_tok = TopK
+    token_idxs = idxs // num_per_tok
+    offset = []
+    input_A = torch.empty(
+        (num_tokens * TopK, K), device=hidden_states.device, dtype=hidden_states.dtype
+    )
+    for expert_id, end_idx in enumerate(tokens_per_expert):
+        start_idx = 0 if expert_id == 0 else tokens_per_expert[expert_id - 1]
+        offset.append(end_idx - start_idx)
+        if start_idx == end_idx:
+            continue
+        exp_token_idxs = token_idxs[start_idx:end_idx]
+        input_A[start_idx:end_idx, :].copy_(hidden_states[exp_token_idxs])
+    offset = torch.tensor(offset, device="cpu", dtype=torch.int32)
+
+    torch.ops.sgl_kernel.moe_grouped_mm_nn(
+        intermediate_cache1,
+        input_A,
+        w1,
+        offset,
+        E,
+    )
+
+    gate, up_ = torch.split(intermediate_cache1, N, dim=1)
+    act = torch.nn.SiLU()
+    intermediate_cache2 = act(gate) * up_
+
+    torch.ops.sgl_kernel.moe_grouped_mm_nn(
+        intermediate_cache3,
+        intermediate_cache2.contiguous(),
+        w2,
+        offset,
+        E,
+    )
+    for expert_id, end_idx in enumerate(tokens_per_expert):
+        start_idx = 0 if expert_id == 0 else tokens_per_expert[expert_id - 1]
+        if start_idx == end_idx:
+            continue
+
+        exp_token_idxs = token_idxs[start_idx:end_idx]
+        expert_out = intermediate_cache3[start_idx:end_idx]
+        expert_out.mul_(topk_weights.view(-1)[idxs[start_idx:end_idx]].unsqueeze(-1))
+        out_hidden_states.scatter_reduce_(
+            0, exp_token_idxs.view(-1, 1).repeat(1, OutK), expert_out, reduce="sum"
+        )
+
+    return out_hidden_states
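
For reference, a minimal call into the new fused_experts wrapper could look like the sketch below. The torch.topk-based routing, all shapes and sizes, and the XPU device are illustrative assumptions, not values from this commit.

import torch
from sgl_kernel import fused_experts

num_tokens, hidden_dim, inter_dim, num_experts, topk = 16, 256, 512, 8, 2
device = "xpu"

hidden_states = torch.randn(num_tokens, hidden_dim, dtype=torch.bfloat16, device=device)
# w1 fuses the gate and up projections: [num_experts, hidden_dim, 2 * inter_dim]
w1 = torch.randn(num_experts, hidden_dim, 2 * inter_dim, dtype=torch.bfloat16, device=device)
# w2 is the down projection: [num_experts, inter_dim, hidden_dim]
w2 = torch.randn(num_experts, inter_dim, hidden_dim, dtype=torch.bfloat16, device=device)

# Toy routing for the example: softmax over random logits, then top-k.
router_probs = torch.randn(num_tokens, num_experts, device=device).softmax(dim=-1)
topk_weights, topk_ids = torch.topk(router_probs, k=topk, dim=-1)

out = fused_experts(hidden_states, w1, w2, topk_weights.to(torch.bfloat16), topk_ids)
print(out.shape)  # (num_tokens, hidden_dim)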

src/sycl/GroupGemm.cpp

Lines changed: 2 additions & 2 deletions

@@ -171,7 +171,7 @@ struct MoERunner {
   }
 };

-void moe_grouped_mm_nt(
+void moe_grouped_mm_nn(
     torch::Tensor& output,
     const torch::Tensor& activations,
     const torch::Tensor& weights,
@@ -195,7 +195,7 @@ void moe_grouped_mm_nt(
       activations.scalar_type() == weights.scalar_type(), "activations and weights must have the same data type");
   TORCH_CHECK(
       activations.scalar_type() == at::ScalarType::Half || activations.scalar_type() == at::ScalarType::BFloat16,
-      "Only float16 and bfloat16 are supported in moe_grouped_mm_nt");
+      "Only float16 and bfloat16 are supported in moe_grouped_mm_nn");

   if (activations.scalar_type() == at::ScalarType::BFloat16) {
     auto stream = at::xpu::getCurrentXPUStream();

src/torch_extension_sycl.cc

Lines changed: 2 additions & 2 deletions

@@ -63,9 +63,9 @@ TORCH_LIBRARY_FRAGMENT(sgl_kernel, m) {
   m.impl("moe_sum", torch::kXPU, &moe_sum);

   m.def(
-      "moe_grouped_mm_nt(Tensor output, Tensor activations, Tensor weights, Tensor total_rows_for_experts, int "
+      "moe_grouped_mm_nn(Tensor output, Tensor activations, Tensor weights, Tensor total_rows_for_experts, int "
       "n_experts) -> ()");
-  m.impl("moe_grouped_mm_nt", torch::kXPU, &moe_grouped_mm_nt);
+  m.impl("moe_grouped_mm_nn", torch::kXPU, &moe_grouped_mm_nn);

   // m.def(
   //     "fp8_blockwise_scaled_mm(Tensor mat_a, Tensor mat_b, Tensor scales_a, Tensor scales_b, ScalarType out_dtype,

tests/test_moe_align.py

Lines changed: 5 additions & 2 deletions

@@ -4,7 +4,7 @@
 import torch
 import triton
 import triton.language as tl
-from sgl_kernel import moe_align_block_size, moe_sum
+from sgl_kernel import moe_align_block_size_impl, moe_sum


 def ceil_div(a, b):
@@ -180,7 +180,7 @@ def test_moe_align_block_size_compare_implementations(
     expert_ids_triton = torch.zeros_like(expert_ids_xpu)
     num_tokens_post_pad_triton = torch.empty_like(num_tokens_post_pad_xpu)

-    moe_align_block_size(
+    moe_align_block_size_impl(
         topk_ids,
         num_experts + 1,
         block_size,
@@ -233,6 +233,9 @@ def test_moe_align_block_size_compare_implementations(
         block_sorted_start:block_sorted_end
     ].sort()[0]

+    import pdb
+
+    pdb.set_trace()
     assert torch.allclose(
         selected_sorted_ids_xpu,
         selected_sorted_ids_triton,
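
The test above drives the low-level moe_align_block_size_impl with caller-provided buffers, while the new moe_align_block_size wrapper in moe.py allocates those buffers itself. Below is a minimal sketch of the wrapper, assuming an XPU build of sgl_kernel; the sizes and int32 topk_ids dtype are illustrative assumptions.

import torch
from sgl_kernel import moe_align_block_size

num_tokens, topk, num_experts, block_size = 16, 2, 8, 64
topk_ids = torch.randint(0, num_experts, (num_tokens, topk), dtype=torch.int32, device="xpu")

sorted_ids, expert_ids, num_tokens_post_pad = moe_align_block_size(
    topk_ids, num_experts, block_size
)
# sorted_ids has room for topk_ids.numel() + (num_experts + 1) * (block_size - 1) entries;
# unused slots are filled with topk_ids.numel() when pad_sorted_token_ids is False.
print(sorted_ids.shape, expert_ids.shape, num_tokens_post_pad.item())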
