
Commit 42135d6

[MoE Refactor] Oracle Select FP8+NVFP4 Kernels In Priority (vllm-project#32414)
1 parent: e14467b

82 files changed: 2699 additions & 1552 deletions

Note: large commits hide some content by default, so only a subset of the 82 changed files is shown below.

.buildkite/test-pipeline.yaml

Lines changed: 40 additions & 0 deletions
@@ -634,6 +634,46 @@ steps:
   - pip install helion
   - pytest -v -s kernels/helion/
 
+
+- label: Kernels FP8 MoE Test (1 H100)
+  timeout_in_minutes: 90
+  gpu: h100
+  num_gpus: 1
+  optional: true
+  commands:
+  - pytest -v -s kernels/moe/test_cutlass_moe.py
+  - pytest -v -s kernels/moe/test_flashinfer.py
+  - pytest -v -s kernels/moe/test_gpt_oss_triton_kernels.py
+  - pytest -v -s kernels/moe/test_modular_oai_triton_moe.py
+  - pytest -v -s kernels/moe/test_moe.py
+  # - pytest -v -s kernels/moe/test_block_fp8.py - failing on main
+  - pytest -v -s kernels/moe/test_block_int8.py
+  - pytest -v -s kernels/moe/test_triton_moe_no_act_mul.py
+  - pytest -v -s kernels/moe/test_triton_moe_ptpc_fp8.py
+
+- label: Kernels FP8 MoE Test (2 H100s)
+  timeout_in_minutes: 90
+  gpu: h100
+  num_gpus: 2
+  optional: true
+  commands:
+  - pytest -v -s kernels/moe/test_deepep_deepgemm_moe.py
+  - pytest -v -s kernels/moe/test_deepep_moe.py
+  - pytest -v -s kernels/moe/test_pplx_cutlass_moe.py
+  # - pytest -v -s kernels/moe/test_pplx_moe.py - failing on main
+
+- label: Kernels Fp4 MoE Test (B200)
+  timeout_in_minutes: 60
+  gpu: b200
+  num_gpus: 1
+  optional: true
+  commands:
+  - pytest -v -s kernels/moe/test_cutedsl_moe.py
+  - pytest -v -s kernels/moe/test_flashinfer_moe.py
+  - pytest -v -s kernels/moe/test_nvfp4_moe.py
+  - pytest -v -s kernels/moe/test_ocp_mx_moe.py
+
+
 - label: Model Executor Test # 23min
   timeout_in_minutes: 35
   torch_nightly: true

benchmarks/kernels/benchmark_cutlass_moe_fp8.py

Lines changed: 7 additions & 5 deletions
@@ -9,6 +9,7 @@
 import torch
 
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
 from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8
@@ -138,12 +139,13 @@ def bench_run(
     fn = mk.FusedMoEModularKernel(
         MoEPrepareAndFinalizeNoEP(),
         CutlassExpertsFp8(
-            out_dtype=a.dtype,
-            e=num_experts,
-            n=n,
-            k=k,
+            moe_config=make_dummy_moe_config(
+                num_experts=num_experts,
+                hidden_dim=k,
+                intermediate_size_per_partition=n,
+                in_dtype=a.dtype,
+            ),
             quant_config=quant_config,
-            device=w1.device,
         ),
     )
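This is the recurring pattern of the refactor: the kernel's loose shape and dtype arguments (e, n, k, out_dtype, device) are folded into a single MoE config, built in the benchmarks via the make_dummy_moe_config test helper. A minimal sketch of the mapping, using placeholder sizes rather than the benchmark's real shapes:

import torch

from tests.kernels.moe.utils import make_dummy_moe_config

# Placeholder problem sizes; the benchmark derives these from its MoE weight shapes.
cfg = make_dummy_moe_config(
    num_experts=8,                          # was `e`
    hidden_dim=4096,                        # was `k`
    intermediate_size_per_partition=14336,  # was `n`
    in_dtype=torch.bfloat16,                # was `out_dtype`; `device` is no longer passed
)
# `cfg` is then handed to CutlassExpertsFp8 as `moe_config=` in place of the old arguments.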

benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py

Lines changed: 3 additions & 4 deletions
@@ -12,6 +12,7 @@
 import torch.utils.benchmark as benchmark
 
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.config import (
@@ -198,8 +199,7 @@ def run_cutlass_moe_fp4(
     kernel = mk.FusedMoEModularKernel(
         MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
         CutlassExpertsFp4(
-            out_dtype=dtype,
-            max_experts_per_worker=e,
+            make_dummy_moe_config(),
             quant_config=quant_config,
         ),
     )
@@ -244,8 +244,7 @@ def run_cutlass_from_graph(
     kernel = mk.FusedMoEModularKernel(
         MoEPrepareAndFinalizeNoEP(defer_input_quant=True),
         CutlassExpertsFp4(
-            out_dtype=dtype,
-            max_experts_per_worker=e,
+            make_dummy_moe_config(),
             quant_config=quant_config,
         ),
     )
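For the NVFP4 path the kernel no longer takes any shape information at construction time: a default dummy config is passed positionally, and out_dtype / max_experts_per_worker are dropped. A minimal sketch, assuming quant_config is an NVFP4 MoE quant config built as elsewhere in the benchmark:

from tests.kernels.moe.utils import make_dummy_moe_config
from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp4

# Sketch only: the MoE config is now the first positional argument.
experts = CutlassExpertsFp4(
    make_dummy_moe_config(),    # defaults are sufficient for this benchmark path
    quant_config=quant_config,  # assumed: a pre-built NVFP4 quant config
)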

benchmarks/kernels/benchmark_grouped_gemm_cutlass.py

Lines changed: 13 additions & 12 deletions
@@ -6,6 +6,7 @@
 from benchmark_shapes import WEIGHT_SHAPES_MOE
 
 import vllm.model_executor.layers.fused_moe.modular_kernel as mk
+from tests.kernels.moe.utils import make_dummy_moe_config
 from vllm import _custom_ops as ops
 from vllm.config import ParallelConfig, VllmConfig, set_current_vllm_config
 from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config
@@ -134,13 +135,13 @@ def run_cutlass_moe(
     fn = mk.FusedMoEModularKernel(
         MoEPrepareAndFinalizeNoEP(),
         CutlassExpertsFp8(
-            out_dtype=a.dtype,
-            # NOTE(rob): w2 is shaped as [E, hidden, intermediate]
-            e=w2.shape[0],
-            n=w2.shape[2],
-            k=w2.shape[1],
+            moe_config=make_dummy_moe_config(
+                num_experts=w2.shape[0],
+                hidden_dim=w2.shape[1],
+                intermediate_size_per_partition=w2.shape[2],
+                in_dtype=a.dtype,
+            ),
             quant_config=quant_config,
-            device=w1.device,
         ),
     )
 
@@ -166,13 +167,13 @@ def run_cutlass_from_graph(
     fn = mk.FusedMoEModularKernel(
         MoEPrepareAndFinalizeNoEP(),
         CutlassExpertsFp8(
-            out_dtype=a.dtype,
-            # NOTE(rob): w2 is shaped as [E, hidden, intermediate]
-            e=w2.shape[0],
-            n=w2.shape[2],
-            k=w2.shape[1],
+            moe_config=make_dummy_moe_config(
+                num_experts=w2.shape[0],
+                hidden_dim=w2.shape[1],
+                intermediate_size_per_partition=w2.shape[2],
+                in_dtype=a.dtype,
+            ),
             quant_config=quant_config,
-            device=w1.device,
         ),
     )

benchmarks/kernels/benchmark_moe.py

Lines changed: 29 additions & 1 deletion
@@ -16,10 +16,16 @@
 from ray.experimental.tqdm_ray import tqdm
 
 from vllm.model_executor.layers.fused_moe.config import (
+    FusedMoEConfig,
+    FusedMoEParallelConfig,
     FusedMoEQuantConfig,
+    RoutingMethodType,
     _get_config_dtype_str,
 )
 from vllm.model_executor.layers.fused_moe.fused_moe import *
+from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
+    TritonOrDeepGemmExperts,
+)
 from vllm.platforms import current_platform
 from vllm.transformers_utils.config import get_config
 from vllm.triton_utils import triton
@@ -194,10 +200,33 @@ def run():
             block_shape=block_quant_shape,
         )
 
+        deep_gemm_experts = mk.FusedMoEModularKernel(
+            prepare_finalize=MoEPrepareAndFinalizeNoEP(),
+            fused_experts=TritonOrDeepGemmExperts(
+                moe_config=FusedMoEConfig(
+                    num_experts=num_experts,
+                    experts_per_token=topk,
+                    hidden_dim=hidden_size,
+                    intermediate_size_per_partition=shard_intermediate_size,
+                    num_local_experts=num_experts,
+                    activation="silu",
+                    parallel_config=FusedMoEParallelConfig.make_no_parallel(),
+                    in_dtype=init_dtype,
+                    routing_method=RoutingMethodType.TopK,
+                ),
+                quant_config=quant_config,
+            ),
+        )
+
         with override_config(config):
             topk_weights, topk_ids, token_expert_indices = fused_topk(
                 x, input_gating, topk, renormalize=not use_deep_gemm
             )
+
+            if use_deep_gemm:
+                return deep_gemm_experts(
+                    x, w1, w2, topk_weights, topk_ids, inplace=True
+                )
             return fused_experts(
                 x,
                 w1,
@@ -206,7 +235,6 @@ def run():
                 topk_ids,
                 inplace=True,
                 quant_config=quant_config,
-                allow_deep_gemm=use_deep_gemm,
             )
 
         # JIT compilation & warmup
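The substantive change in this benchmark is that fused_experts no longer accepts allow_deep_gemm; the DeepGEMM path is driven instead through the explicitly constructed modular kernel shown in the diff above. A condensed before/after sketch (variable names follow the benchmark; this is illustrative, not a drop-in snippet):

# Before this commit: DeepGEMM was selected inside fused_experts via a flag.
out = fused_experts(
    x, w1, w2, topk_weights, topk_ids,
    inplace=True,
    quant_config=quant_config,
    allow_deep_gemm=use_deep_gemm,  # removed by this commit
)

# After this commit: the caller builds a FusedMoEModularKernel wrapping
# TritonOrDeepGemmExperts (see the diff above) and invokes it directly.
if use_deep_gemm:
    out = deep_gemm_experts(x, w1, w2, topk_weights, topk_ids, inplace=True)
else:
    out = fused_experts(
        x, w1, w2, topk_weights, topk_ids,
        inplace=True,
        quant_config=quant_config,
    )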

docs/design/moe_kernel_features.md

Lines changed: 2 additions & 2 deletions
@@ -85,10 +85,10 @@ To be used with a particular `FusedMoEPrepareAndFinalize` subclass, MoE kernels
 |--------|-------------------|--------------|---------------|---------------------|-----------------------|---------|--------|
 | triton | standard | all<sup>1</sup> | G,A,T | silu, gelu,</br>swigluoai,</br>silu_no_mul,</br>gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],</br>[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] |
 | triton (batched) | batched | all<sup>1</sup> | G,A,T | silu, gelu | <sup>6</sup> | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] |
-| deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`deep_gemm_moe_fp8`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.deep_gemm_moe_fp8],</br>[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
+| deep gemm | standard,</br>batched | fp8 | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.deep_gemm_moe.DeepGemmExperts],</br>[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe.BatchedDeepGemmExperts] |
 | cutlass_fp4 | standard,</br>batched | nvfp4 | A,T | silu | Y | Y | [`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] |
 | cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
-| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
+| flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
 | gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
 | marlin | standard,</br>batched | <sup>3</sup> / N/A | <sup>3</sup> / N/A | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
 | trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |

tests/compile/test_fusion_attn.py

Lines changed: 3 additions & 3 deletions
@@ -43,7 +43,7 @@
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     QuantKey,
     kFp8StaticTensorSym,
-    kNvfp4Quant,
+    kNvfp4Dynamic,
 )
 from vllm.platforms import current_platform
 from vllm.utils.flashinfer import has_flashinfer
@@ -215,7 +215,7 @@ def forward(self, q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
 class TestAttentionNvfp4QuantPatternModel(AttentionQuantPatternModel):
     """Test model for AttentionNvfp4QuantPattern fusion."""
 
-    quant_key = kNvfp4Quant
+    quant_key = kNvfp4Dynamic
 
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
@@ -468,7 +468,7 @@ def test_attention_quant_pattern(
 
     # Note: for fp8, fully_replaced=False because query quant ops remain in graph.
     # Only output quant ops are fused into attention.
-    test_backend.check_before_ops([quant_op], fully_replaced=quant_key is kNvfp4Quant)
+    test_backend.check_before_ops([quant_op], fully_replaced=quant_key is kNvfp4Dynamic)
 
     # access the underlying `AttnFusionPass` on the `LazyInitPass`
     assert attn_pass.pass_.matched_count == sum(attn_fusion_supported)

tests/compile/test_silu_mul_quant_fusion.py

Lines changed: 3 additions & 3 deletions
@@ -44,7 +44,7 @@
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     GroupShape,
     kFp8StaticTensorSym,
-    kNvfp4Quant,
+    kNvfp4Dynamic,
 )
 from vllm.platforms import current_platform
 
@@ -134,11 +134,11 @@ def forward(self, x):
     def ops_in_model_before(self):
         return [
             SILU_MUL_OP if self.enable_silu_mul_custom_op else torch.ops.aten.mul,
-            QUANT_OPS[kNvfp4Quant],
+            QUANT_OPS[kNvfp4Dynamic],
         ]
 
     def ops_in_model_after(self):
-        return [FUSED_OPS[kNvfp4Quant]]
+        return [FUSED_OPS[kNvfp4Dynamic]]
 
 
 class TestSiluMulGroupFp8QuantModel(torch.nn.Module):

tests/evals/gsm8k/configs/moe-refactor-dp-ep/Llama-4-Scout-Fp8-ModelOpt-triton.yaml

Lines changed: 3 additions & 0 deletions
@@ -3,3 +3,6 @@ accuracy_threshold: 0.92
 num_questions: 1319
 num_fewshot: 5
 server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel"
+env:
+  VLLM_USE_FLASHINFER_MOE_FP8: "0"
+  VLLM_USE_DEEP_GEMM: "0"

tests/evals/gsm8k/configs/moe-refactor-dp-ep/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm-deepep-ll.yaml

Lines changed: 0 additions & 1 deletion
@@ -6,4 +6,3 @@ server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enab
 env:
   VLLM_USE_DEEP_GEMM: "1"
   VLLM_USE_DEEP_GEMM_MOE: "1"
-  VLLM_USE_DEEP_GEMM_E8M0: "0"

0 commit comments
