36 changes: 30 additions & 6 deletions auto_round/compressors/base.py
@@ -93,6 +93,7 @@
is_fp8_linear,
is_fp8_model,
is_hpex_available,
is_moe_model,
llm_load_model,
memory_monitor,
mv_module_from_gpu,
@@ -390,19 +391,28 @@ def __init__(
)

# Automatically adjust the disable_opt_rtn option if the user does not explicitly set it.
# Keep a copy of the original value so we can tell whether the user set it explicitly; a little ugly, but it avoids issues with None later.
self.orig_disable_opt_rtn = disable_opt_rtn
if self.iters != 0 and self.orig_disable_opt_rtn is not None:
logger.warning("`disable_opt_rtn` only works when `iters` is set to 0, ignore it now.")
disable_opt_rtn = True
if (
self.bits >= 8
and self.act_bits >= 16
and self.iters == 0
and self.data_type == "int"
and disable_opt_rtn is None
):
logger.warning("For INT8 RTN quantization, set `--disable_opt_rtn` as default.")
logger.warning("`disable_opt_rtn` is turned on for W8A16 quantization to improve efficiency.")
disable_opt_rtn = True
if disable_opt_rtn is None:
if self.iters == 0:
logger.info("For the most RTN cases, set `--disable_opt_rtn` to False as default.")
disable_otp_rtn = False
if disable_opt_rtn is None and self.iters == 0:
logger.info(
"`enable_opt_rtn` is turned on, set `--disable_opt_rtn` for higher speed at the cost of accuracy."
)
disable_opt_rtn = False

# Important note: this detection is not very robust; do NOT rely on it for anything high-risk.
self.is_moe_model = is_moe_model(self.model)

self.minmax_lr = minmax_lr or self.lr
self.enable_alg_ext = enable_alg_ext
@@ -1105,6 +1115,20 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=T
m.zp = None
else:
try:
disable_opt_rtn = self.disable_opt_rtn
if (
not disable_opt_rtn
and self.orig_disable_opt_rtn is None
and self.is_moe_model
and "expert" in m.tmp_name
and "shared_expert" not in m.tmp_name
and self.super_bits is None # GGUF still uses the optimized RTN for MoE layers
):
disable_opt_rtn = True
logger.warning_once(
"MoE layer detected: optimized RTN is disabled for efficiency. "
"Use `--enable_opt_rtn` to force-enable it for MoE layers."
)
m = m.to(tuning_device)
m = WrapperLinear(
m,
@@ -1113,7 +1137,7 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=T
enable_norm_bias_tuning=False,
enable_round_tuning=False,
enable_torch_compile=self.enable_torch_compile,
disable_opt_rtn=self.disable_opt_rtn,
disable_opt_rtn=disable_opt_rtn,
)
m = m.unwrapper({})
except torch.OutOfMemoryError:
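Taken together, the base.py changes turn `disable_opt_rtn` into a tri-state option (`None` means "decide automatically") and add a per-layer override that skips the optimized RTN search on MoE expert layers. The sketch below summarizes that decision flow as two standalone functions; it is an approximation of the logic in this diff, not the library's actual API, and the handling of an unset value on the `iters != 0` path is an assumption.

```python
from typing import Optional


def resolve_disable_opt_rtn(
    user_value: Optional[bool],  # None means the user did not set the flag
    iters: int,
    bits: int,
    act_bits: int,
    data_type: str,
) -> bool:
    """Sketch of the constructor-level default resolution shown in base.py."""
    if iters != 0 and user_value is not None:
        # Optimized RTN only applies to the pure-RTN path (iters == 0).
        return True
    if user_value is None and iters == 0 and bits >= 8 and act_bits >= 16 and data_type == "int":
        # W8A16 integer RTN: skip the optimized search for efficiency.
        return True
    if user_value is None and iters == 0:
        # Other RTN cases default to the optimized RTN for better accuracy.
        return False
    # Assumption: an explicit user value wins; an otherwise-unset value behaves as False.
    return bool(user_value)


def layer_disable_opt_rtn(
    resolved: bool,               # value returned by resolve_disable_opt_rtn
    user_set_explicitly: bool,    # True if the user passed disable_opt_rtn themselves
    is_moe_model: bool,
    layer_name: str,
    super_bits: Optional[int] = None,  # set for GGUF-style formats
) -> bool:
    """Sketch of the per-layer MoE override added to _quantize_layer_via_rtn."""
    if (
        not resolved
        and not user_set_explicitly
        and is_moe_model
        and "expert" in layer_name
        and "shared_expert" not in layer_name
        and super_bits is None  # GGUF keeps the optimized RTN even for expert layers
    ):
        return True
    return resolved
```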
2 changes: 1 addition & 1 deletion auto_round/compressors/config.py
@@ -247,7 +247,7 @@ def is_default(self):
@dataclass
class TuningExtraConfig(BaseExtraConfig):
amp: bool = True
disable_opt_rtn: bool | None = True
disable_opt_rtn: bool | None = None
enable_alg_ext: bool = False
enable_minmax_tuning: bool = True
enable_norm_bias_tuning: bool = False
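The default change from `True` to `None` is what enables the auto-detection above: `None` now means "not set by the user", so the compressor can pick a per-mode default without clobbering an explicit choice. A minimal, self-contained illustration of the tri-state pattern (the class name here is illustrative, not the library's):

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class ExampleTuningConfig:
    # Tri-state flag: None = decide automatically, True/False = explicit user choice.
    disable_opt_rtn: Optional[bool] = None


cfg = ExampleTuningConfig()
user_set_it = cfg.disable_opt_rtn is not None   # False -> auto mode
cfg.disable_opt_rtn = False                     # explicit: always use optimized RTN
```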
6 changes: 3 additions & 3 deletions auto_round/utils/device.py
@@ -832,10 +832,10 @@ def get_moe_memory_ratio(block: torch.nn.Module) -> float:
- Mixtral (2/8 experts): returns 0.25
- Qwen2MoE (4/60 experts): returns ~0.067
"""
from auto_round.utils.model import is_moe
from auto_round.utils.model import is_moe_layer

for name, module in block.named_modules():
if not is_moe(module):
if not is_moe_layer(module):
continue

config = getattr(block, "config", None)
@@ -898,7 +898,7 @@ def estimate_tuning_block_mem(
- additional_memory (float): Additional memory overhead (in GB) for operations like attention.
"""
# Calculate all block parameters memory and build layer-wise memory dict
from auto_round.utils.model import get_layer_features, is_moe
from auto_round.utils.model import get_layer_features, is_moe_layer

layer_memory_dict = {}

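The rename from `is_moe` to `is_moe_layer` does not change the memory heuristic itself: per the docstring, `get_moe_memory_ratio` returns roughly the fraction of experts that are active per token. A hedged sketch of that ratio, assuming the usual Hugging Face config attribute names (the real function may read different fields):

```python
def moe_memory_ratio_from_config(config) -> float:
    """Active-expert fraction, e.g. Mixtral 2/8 -> 0.25, Qwen2MoE 4/60 -> ~0.067."""
    active = getattr(config, "num_experts_per_tok", None)
    # Mixtral calls the total `num_local_experts`; Qwen2MoE calls it `num_experts`.
    total = getattr(config, "num_experts", None) or getattr(config, "num_local_experts", None)
    if active and total:
        return active / total
    return 1.0  # assumption: treat an unknown layout as "all experts resident"
```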
17 changes: 14 additions & 3 deletions auto_round/utils/model.py
@@ -598,9 +598,9 @@ def is_diffusion_model(model_or_path: Union[str, object]) -> bool:
return False


def is_moe(module: torch.nn.Module) -> bool:
def is_moe_layer(module: torch.nn.Module) -> bool:
"""Returns whether the module is an MOE layer."""
return any(
return "moe" in type(module).__name__.lower() or any(
key in type(module).__name__.lower()
for key in [
"MixtralSparseMoeBlock".lower(),
@@ -1191,6 +1191,17 @@ def mv_module_from_gpu(module):
return module.to("cpu")


def is_moe_model(model: torch.nn.Module) -> bool:
if hasattr(model, "config"):
for key in model.config.to_dict().keys():
if "moe" in key or "expert" in key:
return True
for n, m in model.named_modules():
if "expert" in n:
return True
return False


def to_dtype(input, dtype=torch.float32):
"""Moves input data to the specified data type.

Expand Down Expand Up @@ -1281,7 +1292,7 @@ def set_amax_for_all_moe_layers(model: torch.nn.Module, layer_name=None, attr_na
model = get_module(model, moe_name)
# Handle input quantizers of experts that are not calibrated
for name, sub_module in model.named_modules():
if not (is_moe(sub_module) and hasattr(sub_module, "experts")):
if not (is_moe_layer(sub_module) and hasattr(sub_module, "experts")):
continue
expert_linear_names = get_expert_linear_names(sub_module)
# Get input projection names for FP8 dispatch unification
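As the comment in base.py warns, `is_moe_model` is only a heuristic: it looks for "moe"/"expert" substrings in the config keys and for "expert" in module names. A quick usage sketch follows; the import path is taken from this diff, while the checkpoint name is an illustrative assumption rather than something exercised by this PR.

```python
from transformers import AutoModelForCausalLM

from auto_round.utils.model import is_moe_layer, is_moe_model

# Illustrative MoE checkpoint; any model whose config exposes e.g. `num_experts` would do.
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B")

print(is_moe_model(model))                             # expected: True (config has expert-related keys)
print(any(is_moe_layer(m) for m in model.modules()))   # True if a sparse-MoE block is present
```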