1616"""Support quantization for megatron linear layers."""
1717
1818import logging
19+ import types
1920import warnings
2021from typing import Any
2122
2829from megatron .core .parallel_state import get_data_parallel_group
2930from megatron .core .tensor_parallel .mappings import gather_from_sequence_parallel_region
3031from megatron .core .transformer import MegatronModule
32+ from megatron .core .transformer .attention import Attention
3133from megatron .core .transformer .utils import make_sharded_tensors_for_checkpoint
3234from megatron .core .utils import get_tensor_model_parallel_group_if_none
3335
3840)
3941from modelopt .torch .utils .distributed import ParallelState
4042
41- from ..model_calib import max_calibrate
4243from ..nn import QuantModule , QuantModuleRegistry , TensorQuantizer
4344from ..nn .modules .quant_linear import RealQuantLinear
4445from ..qtensor import QTensorWrapper
@@ -98,11 +99,6 @@ def quant_module_get_extra_state(self) -> dict:
     """
     extra_state = {}

-    is_enabled = self.weight_quantizer.is_enabled if hasattr(self, "weight_quantizer") else False
-
-    if not is_enabled:
-        return extra_state
-
     quantizer_state = {}
     for name, module in self.named_modules():
         if isinstance(module, TensorQuantizer):
@@ -201,6 +197,19 @@ def quant_module_set_extra_state(self, state: Any):
     self.allow_post_restore = False


+def _create_incompatible_method(method_name: str):
+    """Create a method that raises an error for incompatible flash decode methods."""
+
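+    # The returned function is bound to a specific Attention instance with types.MethodType in
+    # _configure_attention_for_kv_cache_quant, so calling the patched method raises immediately
+    # instead of silently bypassing the KV cache quantizers in core_attention.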
+    def _incompatible_method(self, *args, **kwargs):
+        raise NotImplementedError(
+            f"{method_name} is not compatible with ModelOpt KV cache quantization. "
+            f"KV cache quantization requires core_attention to be called. "
+            f"Please raise an issue at https://github.com/NVIDIA/Model-Optimizer if you need this feature."
+        )
+
+    return _incompatible_method
+
+
 def megatron_replace_quant_module_hook(model: torch.nn.Module):
     """Configure Megatron-Core model quantization support.

@@ -211,10 +220,39 @@ def megatron_replace_quant_module_hook(model: torch.nn.Module):
     1. We change TransformerConfig to enable heterogeneous distributed checkpointing.
     2. We enable all sub-QuantModule to store quantizer_state as extra_state by
        typing-matching the QuantModuleRegistry.
+    3. For Attention modules, we configure them to use the core_attention path for KV cache quantization.
     """

+    def _configure_attention_for_kv_cache_quant(module: Attention):
+        """Configure Attention module for KV cache quantization compatibility."""
+        # Disable flash_decode if enabled - it bypasses core_attention (flash decode is inference-only)
+        if getattr(module.config, "flash_decode", False):
+            warnings.warn(
+                "flash_decode=True is incompatible with ModelOpt KV cache quantization. "
+                "Setting flash_decode=False. Flash decode bypasses core_attention during decode phase."
+            )
+            module.config.flash_decode = False
+
+        # Set dtype and device for core_attention (needed for modelopt_post_restore)
+        assert hasattr(module, "core_attention"), "Attention module must have core_attention"
+        param = next(iter(module.parameters()), None)
+        if param is not None:
+            module.core_attention.dtype = param.dtype
+            module.core_attention.device = param.device
+
+        # Patch flash_decode and flash_decode_and_prefill to raise errors
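+        # (types.MethodType binds the override to this Attention instance only; the Attention
+        # class itself is not modified.)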
+        module.flash_decode = types.MethodType(_create_incompatible_method("flash_decode"), module)
+        module.flash_decode_and_prefill = types.MethodType(
+            _create_incompatible_method("flash_decode_and_prefill"), module
+        )
+
     def _register_extra_state_callbacks(model: torch.nn.Module):
         for name, module in model.named_modules():
+            if name.endswith("output_layer"):
+                # output_layer is not quantized,
+                # hence we don't need to register extra state callbacks for it
+                continue
+
             if type(module) in QuantModuleRegistry:
                 # This module will be replaced as a QuantModule
                 register_modelopt_extra_state_callbacks(
@@ -223,6 +261,10 @@ def _register_extra_state_callbacks(model: torch.nn.Module):
                     quant_module_set_extra_state,
                 )

+            # Configure Attention modules for KV cache quantization
+            if isinstance(module, Attention):
+                _configure_attention_for_kv_cache_quant(module)
+
     for name, module in model.named_modules():
         if isinstance(module, MegatronModule):
             if "vision_model" not in name:
@@ -632,152 +674,44 @@ def _setup(self):
         self.k_bmm_quantizer = TensorQuantizer()
         self.v_bmm_quantizer = TensorQuantizer()

-    def _calibrate_quantizers(self):
-        """Calibrate quantizers with minimal dummy tensors."""
-        # Get device and dtype from the parent module's parameters
-        param = next(iter(self.parameters()), None)
-        device = param.device if param is not None else torch.device("cuda")
-        dtype = param.dtype if param is not None else torch.float16
-
-        # TEDotProductAttention expects format 'sbhd' or 'bshd' depending on rope_fusion
-        batch_size = 1
-        seq_len = 1
-
-        # Get dimensions from config
-        num_heads = self.config.num_attention_heads
-        head_dim = (
-            self.config.kv_channels
-            if hasattr(self.config, "kv_channels")
-            else self.config.hidden_size // num_heads
+        # Set parallel_state for distributed sync of BMM quantizers
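+        # `get_data_parallel_group(with_context_parallel=True)` asserts when the data parallel
+        # group with context parallelism has not been initialized, hence the fallback below.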
+        try:
+            data_parallel_group = get_data_parallel_group(with_context_parallel=True)
+        except AssertionError:
+            data_parallel_group = get_data_parallel_group()
+        self.parallel_state = ParallelState(
+            data_parallel_group,
+            mcore_parallel.get_tensor_model_parallel_group(),
         )

-        # Determine tensor format (default to sbhd if not specified)
-        apply_rope_fusion = getattr(self.config, "apply_rope_fusion", False)
-        qkv_format = "bshd" if apply_rope_fusion else "sbhd"
-
-        if qkv_format == "sbhd":
-            dummy_tensor = torch.randn(
-                seq_len, batch_size, num_heads, head_dim, device=device, dtype=dtype
-            )
-        else:
-            dummy_tensor = torch.randn(
-                batch_size, seq_len, num_heads, head_dim, device=device, dtype=dtype
-            )
-
-        # Calibrate each quantizer
-        quantizers = [
-            ("q_bmm_quantizer", self.q_bmm_quantizer),
-            ("k_bmm_quantizer", self.k_bmm_quantizer),
-            ("v_bmm_quantizer", self.v_bmm_quantizer),
-        ]
-
-        for _, quantizer in quantizers:
-            if quantizer is not None and quantizer.is_enabled():
-                if not hasattr(quantizer, "_amax") or quantizer._amax is None:
-                    quantizer.reset_amax()
-                max_calibrate(quantizer, lambda q: q(dummy_tensor), distributed_sync=False)
-
     def forward(self, query, key, value, *args, **kwargs):
-        """Apply post-RoPE quantization to KV cache.
-
-        TEDotProductAttention receives Q, K, V after RoPE is applied,
-        so we quantize them directly for KV cache quantization.
-        """
+        """Apply post-RoPE quantization to KV cache."""
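+        # TEDotProductAttention receives Q, K and V after RoPE has been applied, so quantizing
+        # them here captures the post-RoPE values that feed the KV cache.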
         # Quantize Q, K, V
         query = self.q_bmm_quantizer(query)
         key = self.k_bmm_quantizer(key)
         value = self.v_bmm_quantizer(value)
-
         return super().forward(query, key, value, *args, **kwargs)

-    def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
-        """Create a sharded state dictionary for distributed checkpointing."""
-        sharded_state_dict = {}
-
-        # First add non-quantizer parameters
-        for k, v in self.state_dict(prefix="", keep_vars=True).items():
-            if isinstance(v, torch.Tensor) and v is not None and "_quantizer" not in k:
-                sharded_state_dict[prefix + k] = v
-
-        # Process _amax in bmm_quantizers
-        for name, quantizer in [
-            ("q_bmm_quantizer", self.q_bmm_quantizer),
-            ("k_bmm_quantizer", self.k_bmm_quantizer),
-            ("v_bmm_quantizer", self.v_bmm_quantizer),
-        ]:
-            if hasattr(quantizer, "_amax") and quantizer._amax is not None:
-                amax_key = f"{prefix}{name}._amax"
-                sharded_state_dict[amax_key] = quantizer._amax
-
-        # Process other quantizer parameters in bmm_quantizers
-        quantizer_state_dict = {
-            k: v
-            for k, v in self.state_dict(prefix="", keep_vars=True).items()
-            if isinstance(v, torch.Tensor) and "_quantizer" in k and "_amax" not in k
-        }
-
-        if quantizer_state_dict:
-            sharded_state_dict.update(
-                **make_sharded_tensors_for_checkpoint(
-                    quantizer_state_dict, prefix, {}, sharded_offsets
-                )
-            )
-
-        return sharded_state_dict
-
-    def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
-        """Handle loading state dict for quantizers."""
-        for quantizer_name in ["q_bmm_quantizer", "k_bmm_quantizer", "v_bmm_quantizer"]:
-            full_prefix = f"{prefix}{quantizer_name}."
-            amax_key = f"{prefix}{quantizer_name}._amax"
-
-            # If amax is in state_dict, rename it to the format expected by TensorQuantizer
-            if amax_key in state_dict:
-                expected_amax_key = f"{full_prefix}_amax"
-                state_dict[expected_amax_key] = state_dict.pop(amax_key)
-
-        # Handle other quantizer states
-        for k in list(state_dict.keys()):
-            if "_quantizer" in k and "_amax" not in k:
-                name = k.split(prefix)[-1] if prefix else k
-                if name in self.state_dict():
-                    state_dict[k] = state_dict[k].view_as(self.state_dict()[name])
-
-        super()._load_from_state_dict(state_dict, prefix, *args, **kwargs)
-
     def modelopt_post_restore(self, name=""):
         """Restore quantizer states after model loading."""
-        super().modelopt_post_restore(name)
-
-        def _check_unsupported_states(quantizer):
-            """Check for unsupported quantizer states and warn if found."""
-            if not hasattr(quantizer, "state_dict"):
-                return
-
-            for k in quantizer.state_dict():
-                if k not in ["_amax", "_pre_quant_scale"]:
-                    warnings.warn(
-                        f"Restore of {k} for {name} is not supported. The restore of this layer might be "
-                        f"incorrect. Please implement a custom restore for {k}."
-                    )
-
-        calibration_needed = False
-
-        for quantizer_name, quantizer in [
-            ("q_bmm_quantizer", self.q_bmm_quantizer),
-            ("k_bmm_quantizer", self.k_bmm_quantizer),
-            ("v_bmm_quantizer", self.v_bmm_quantizer),
-        ]:
-            if not hasattr(self, quantizer_name) or not quantizer.is_enabled():
-                continue
-
-            _check_unsupported_states(quantizer)
-
-            if not hasattr(quantizer, "_amax") or quantizer._amax is None:
-                calibration_needed = True
+        for tq in [self.q_bmm_quantizer, self.k_bmm_quantizer, self.v_bmm_quantizer]:
+            # TODO: Add support for non-scalar states such as the affine KV cache bias vector,
+            # which is per head and per channel
+            if not all(v.numel() == 1 for v in tq.state_dict().values()):
+                raise NotImplementedError(
+                    "Only scalar states are supported for KV Cache/BMM Quantizers"
+                )
+        # dtype and device should have been set in `megatron_replace_quant_module_hook`
+        # via `_configure_attention_for_kv_cache_quant`
+        assert hasattr(self, "device") and hasattr(self, "dtype")
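+        # Move the restored quantizer states (e.g. amax) onto the dtype/device that the replace
+        # hook captured from the attention module's parameters.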
+        self.to(device=self.device, dtype=self.dtype)

-        if calibration_needed:
-            self._calibrate_quantizers()
+    def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
+        # Currently we do not need sharded_state_dict for TEDotProductAttention since the amax
+        # values are scalars. However, we would need this in the future to support non-scalar
+        # states such as the affine KV cache quantization bias vector.
+        state_dict = self.state_dict(prefix="", keep_vars=True)
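+        # An empty axis map ({}) means no tensor-parallel sharding axis is specified for any
+        # entry, which is sufficient while all BMM quantizer states are scalars.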
+        return make_sharded_tensors_for_checkpoint(state_dict, prefix, {}, sharded_offsets)


 @QuantModuleRegistry.register({megatron_moe_layer.MoELayer: "megatron_moe_MoELayer"})