diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index a2af9e011..aec2bbee1 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -93,6 +93,7 @@
     is_fp8_linear,
     is_fp8_model,
     is_hpex_available,
+    is_moe_model,
     llm_load_model,
     memory_monitor,
     mv_module_from_gpu,
@@ -390,6 +391,11 @@ def __init__(
             )
 
         # Automatically adjust the disable_opt_rtn option if the user does not explicitly set it.
+        # Keep a copy of the user's original value so the None sentinel is not lost; a little ugly, but needed.
+        self.orig_disable_opt_rtn = disable_opt_rtn
+        if self.iters != 0 and self.orig_disable_opt_rtn is not None:
+            logger.warning("`disable_opt_rtn` only takes effect when `iters` is set to 0; ignoring it.")
+            disable_opt_rtn = True
         if (
             self.bits >= 8
             and self.act_bits >= 16
@@ -397,12 +403,16 @@ def __init__(
             and self.data_type == "int"
             and disable_opt_rtn is None
         ):
-            logger.warning("For INT8 RTN quantization, set `--disable_opt_rtn` as default.")
+            logger.warning("`disable_opt_rtn` is turned on for W8A16 quantization to improve efficiency.")
             disable_opt_rtn = True
-        if disable_opt_rtn is None:
-            if self.iters == 0:
-                logger.info("For the most RTN cases, set `--disable_opt_rtn` to False as default.")
-                disable_otp_rtn = False
+        if disable_opt_rtn is None and self.iters == 0:
+            logger.info(
+                "`enable_opt_rtn` is turned on; set `--disable_opt_rtn` for higher speed at the cost of accuracy."
+            )
+            disable_opt_rtn = False
+
+        # Important note: this heuristic is not very robust; do NOT rely on it for high-risk decisions.
+        self.is_moe_model = is_moe_model(self.model)
 
         self.minmax_lr = minmax_lr or self.lr
         self.enable_alg_ext = enable_alg_ext
@@ -1105,6 +1115,20 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=T
                    m.zp = None
            else:
                try:
+                    disable_opt_rtn = self.disable_opt_rtn
+                    if (
+                        not disable_opt_rtn
+                        and self.orig_disable_opt_rtn is None
+                        and self.is_moe_model
+                        and "expert" in m.tmp_name
+                        and "shared_expert" not in m.tmp_name
+                        and self.super_bits is None  # GGUF still uses the optimized RTN for MoE layers
+                    ):
+                        disable_opt_rtn = True
+                        logger.warning_once(
+                            "MoE layer detected: optimized RTN is disabled for efficiency. "
+                            "Use `--enable_opt_rtn` to force-enable it for MoE layers."
+                        )
                    m = m.to(tuning_device)
                    m = WrapperLinear(
                        m,
@@ -1113,7 +1137,7 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=T
                        enable_norm_bias_tuning=False,
                        enable_round_tuning=False,
                        enable_torch_compile=self.enable_torch_compile,
-                        disable_opt_rtn=self.disable_opt_rtn,
+                        disable_opt_rtn=disable_opt_rtn,
                    )
                    m = m.unwrapper({})
                except torch.OutOfMemoryError:
diff --git a/auto_round/compressors/config.py b/auto_round/compressors/config.py
index f7e0d92f2..dd028b6cd 100644
--- a/auto_round/compressors/config.py
+++ b/auto_round/compressors/config.py
@@ -247,7 +247,7 @@ def is_default(self):
 @dataclass
 class TuningExtraConfig(BaseExtraConfig):
     amp: bool = True
-    disable_opt_rtn: bool | None = True
+    disable_opt_rtn: bool | None = None
     enable_alg_ext: bool = False
     enable_minmax_tuning: bool = True
     enable_norm_bias_tuning: bool = False
diff --git a/auto_round/utils/device.py b/auto_round/utils/device.py
index e19fbca68..8fbcc09fd 100644
--- a/auto_round/utils/device.py
+++ b/auto_round/utils/device.py
@@ -832,10 +832,10 @@ def get_moe_memory_ratio(block: torch.nn.Module) -> float:
     - Mixtral (2/8 experts): returns 0.25
     - Qwen2MoE (4/60 experts): returns ~0.067
     """
-    from auto_round.utils.model import is_moe
+    from auto_round.utils.model import is_moe_layer
 
     for name, module in block.named_modules():
-        if not is_moe(module):
+        if not is_moe_layer(module):
            continue
 
        config = getattr(block, "config", None)
@@ -898,7 +898,7 @@ def estimate_tuning_block_mem(
        - additional_memory (float): Additional memory overhead (in GB) for operations like attention.
     """
 
     # Calculate all block parameters memory and build layer-wise memory dict
-    from auto_round.utils.model import get_layer_features, is_moe
+    from auto_round.utils.model import get_layer_features, is_moe_layer
 
     layer_memory_dict = {}
diff --git a/auto_round/utils/model.py b/auto_round/utils/model.py
index 8b85bc26e..f4c6e7b75 100644
--- a/auto_round/utils/model.py
+++ b/auto_round/utils/model.py
@@ -598,9 +598,9 @@ def is_diffusion_model(model_or_path: Union[str, object]) -> bool:
     return False
 
 
-def is_moe(module: torch.nn.Module) -> bool:
+def is_moe_layer(module: torch.nn.Module) -> bool:
     """Returns whether the module is an MOE layer."""
-    return any(
+    return "moe" in type(module).__name__.lower() or any(
        key in type(module).__name__.lower()
        for key in [
            "MixtralSparseMoeBlock".lower(),
@@ -1191,6 +1191,17 @@ def mv_module_from_gpu(module):
     return module.to("cpu")
 
 
+def is_moe_model(model: torch.nn.Module) -> bool:
+    if hasattr(model, "config"):
+        for key in model.config.to_dict().keys():
+            if "moe" in key or "expert" in key:
+                return True
+    for n, m in model.named_modules():
+        if "expert" in n:
+            return True
+    return False
+
+
 def to_dtype(input, dtype=torch.float32):
     """Moves input data to the specified data type.
 
@@ -1281,7 +1292,7 @@ def set_amax_for_all_moe_layers(model: torch.nn.Module, layer_name=None, attr_na
     model = get_module(model, moe_name)
     # Handle input quantizers of experts that are not calibrated
     for name, sub_module in model.named_modules():
-        if not (is_moe(sub_module) and hasattr(sub_module, "experts")):
+        if not (is_moe_layer(sub_module) and hasattr(sub_module, "experts")):
            continue
        expert_linear_names = get_expert_linear_names(sub_module)
        # Get input projection names for FP8 dispatch unification
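
Reviewer note: the sketch below condenses the defaulting behavior this patch introduces into two standalone functions so it can be read in one place. It is illustrative only: `resolve_disable_opt_rtn_for_rtn` and `skip_opt_rtn_for_layer` are hypothetical helpers that do not exist in auto_round, the sketch only covers the pure-RTN case (`iters == 0`), and the W8A16 branch omits the extra conditions hidden by the hunk context.

```python
from typing import Optional


def resolve_disable_opt_rtn_for_rtn(user_value: Optional[bool], bits: int, act_bits: int, data_type: str) -> bool:
    """Global defaulting for the pure-RTN case (iters == 0); illustrative sketch of this patch's __init__ logic."""
    # An explicit user choice always wins.
    if user_value is not None:
        return user_value
    # W8A16 INT RTN: optimized RTN is disabled by default for efficiency
    # (the real check in the patch includes conditions not shown in the hunk).
    if bits >= 8 and act_bits >= 16 and data_type == "int":
        return True
    # Otherwise optimized RTN stays on by default.
    return False


def skip_opt_rtn_for_layer(
    user_value: Optional[bool],
    model_is_moe: bool,
    layer_name: str,
    super_bits: Optional[int],
) -> bool:
    """Per-layer MoE override; mirrors the check added in _quantize_layer_via_rtn (illustrative sketch)."""
    return (
        user_value is None                    # user did not explicitly toggle opt RTN
        and model_is_moe                      # heuristic result of is_moe_model()
        and "expert" in layer_name
        and "shared_expert" not in layer_name
        and super_bits is None                # GGUF keeps optimized RTN for MoE layers
    )


# Example: a plain expert layer in a MoE model falls back to vanilla RTN by default,
# while an explicitly set value (e.g. disable_opt_rtn=False) keeps optimized RTN on.
assert skip_opt_rtn_for_layer(None, True, "model.layers.0.mlp.experts.3.up_proj", None) is True
assert skip_opt_rtn_for_layer(False, True, "model.layers.0.mlp.experts.3.up_proj", None) is False
```

The switch of `TuningExtraConfig.disable_opt_rtn` from `True` to `None` is what makes this work: the compressor can now tell "user explicitly toggled optimized RTN" (honored as-is, even for MoE experts) apart from "not specified" (resolved by the heuristics above, with `orig_disable_opt_rtn` preserving the sentinel).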