diff --git a/README.md b/README.md
index f24f28ddf..335a8f33f 100644
--- a/README.md
+++ b/README.md
@@ -195,7 +195,7 @@ ar.quantize_and_save(output_dir="./qmodel", format="auto_round")
 ##### Algorithm Settings
 
 - **`enable_alg_ext` (bool)**: [Experimental Feature] Only for `iters>0`. Enable algorithm variants for specific schemes (e.g., MXFP4/W2A16) that could bring notable improvements. Default is `False`.
-- **`disable_opt_rtn` (bool)**: Use pure RTN mode for specific schemes (e.g., GGUF and WOQ). Default is `False` (improved RTN enabled).
+- **`disable_opt_rtn` (bool)**: Use pure RTN mode for specific schemes (e.g., GGUF and WOQ). Default is `None` (auto: improved RTN is enabled except for INT8 RTN, which falls back to pure RTN).
 
 ##### Tuning Process Parameters
 - **`iters` (int)**: Number of tuning iterations (default is `200`). Common values: 0 (RTN mode), 50 (with lr=5e-3 recommended), 1000. Higher values increase accuracy but slow down tuning.
diff --git a/auto_round/__main__.py b/auto_round/__main__.py
index adbd8fe02..6d95b4864 100644
--- a/auto_round/__main__.py
+++ b/auto_round/__main__.py
@@ -255,7 +255,9 @@ def __init__(self, *args, **kwargs):
         )
         tuning.add_argument(
             "--disable_opt_rtn",
-            action="store_true",
+            "--disable-opt-rtn",
+            action=argparse.BooleanOptionalAction,
+            default=None,
             help="Disable optimization for RTN (Round-To-Nearest) mode when iters=0. "
             "RTN is fast but less accurate; keeping optimization enabled is recommended.",
         )
diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 050281983..51c850c16 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Union
+from typing import TYPE_CHECKING, Optional, Union
 
 import torch
 
@@ -85,7 +85,7 @@ def __new__(
         enable_adam: bool = False,
         extra_config: ExtraConfig = None,
         enable_alg_ext: bool = None,
-        disable_opt_rtn: bool = None,
+        disable_opt_rtn: Optional[bool] = None,
         low_cpu_mem_usage: bool = False,
         **kwargs,
     ) -> BaseCompressor:
@@ -112,7 +112,7 @@ def __new__(
             enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2)
                 for better accuracy. Defaults to False.
             disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0) for fast quatnziation
-                with lower accuracy. Defaults to False.
+                with lower accuracy. Defaults to None.
             low_cpu_mem_usage (bool, optional): Lower CPU memory mode. Defaults to False.
             bits (int, optional): Weight quantization bits. Defaults to 4.
 
@@ -291,7 +291,7 @@ class AutoRoundLLM(LLMCompressor):
        act_dynamic (bool, optional): Dynamic activation quantization. Defaults to True.
        enable_torch_compile (bool, optional): Enable torch.compile for quant blocks/layers. Defaults to False.
        device_map (str | dict, optional): Device placement map. Defaults to None.
-       disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to False.
+       disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to None.
        enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2). Defaults to False.
       **kwargs: Backward compatible options:
           - enable_alg_ext, quant_lm_head, lr, lr_scheduler, sampler, not_use_best_mse, dynamic_max_gap,
diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index 2665d74ef..cd9fc516c 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -189,7 +189,7 @@ def __init__(
         device_map: Union[str, torch.device, int, dict] = 0,
         enable_torch_compile: bool = False,
         enable_alg_ext: bool = False,
-        disable_opt_rtn: bool = False,
+        disable_opt_rtn: Optional[bool] = None,
         seed: int = 42,
         low_cpu_mem_usage: bool = False,
         **kwargs,
@@ -226,7 +226,7 @@ def __init__(
            act_dynamic (bool, optional): Dynamic activation quantization. Defaults to True.
            enable_torch_compile (bool, optional): Enable torch.compile for quant blocks/layers. Defaults to False.
            device_map (str | dict, optional): Device placement map. Defaults to None.
-           disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to False.
+           disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to None.
            enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2). Defaults to False.
            **kwargs: Backward compatible options:
               - enable_alg_ext, quant_lm_head, lr, lr_scheduler, not_use_best_mse, dynamic_max_gap,
@@ -389,6 +389,19 @@ def __init__(
                 "for bits <= 2, it is recommended to enable `auto-round-best` "
                 "and turn on `--enable_alg_ext` "
             )
+        # Automatically adjust the disable_opt_rtn option if the user does not explicitly set it.
+        if (
+            self.bits >= 8
+            and self.act_bits >= 16
+            and self.iters == 0
+            and self.data_type == "int"
+            and disable_opt_rtn is None
+        ):
+            logger.warning("for INT8 RTN quantization, `--disable_opt_rtn` is enabled by default.")
+            disable_opt_rtn = True
+        if disable_opt_rtn is None:
+            disable_opt_rtn = False
+
         self.minmax_lr = minmax_lr or self.lr
         self.enable_alg_ext = enable_alg_ext
         self.not_use_best_mse = not_use_best_mse
diff --git a/auto_round/compressors/config.py b/auto_round/compressors/config.py
index 7c9398a91..4bab246c3 100644
--- a/auto_round/compressors/config.py
+++ b/auto_round/compressors/config.py
@@ -14,7 +14,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass, fields
-from typing import Any, Callable, Union
+from typing import Any, Callable, Optional, Union
 
 import torch
 
@@ -32,7 +32,7 @@ def __init__(
         self,
         # tuning
         amp: bool = True,
-        disable_opt_rtn: bool = True,
+        disable_opt_rtn: Optional[bool] = True,
        enable_alg_ext: bool = False,
        enable_minmax_tuning: bool = True,
        enable_norm_bias_tuning: bool = False,
@@ -74,7 +74,7 @@ def __init__(
 
        Args:
            amp (bool): Whether to use automatic mixed precision (default is True).
-           disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to False.
+           disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to True.
            enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2). Defaults to False.
            enable_minmax_tuning (bool, optional): Enable weight min-max tuning. Defaults to True.
            enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning.
@@ -247,7 +247,7 @@ def is_default(self):
 @dataclass
 class TuningExtraConfig(BaseExtraConfig):
     amp: bool = True
-    disable_opt_rtn: bool = True
+    disable_opt_rtn: Optional[bool] = True
     enable_alg_ext: bool = False
     enable_minmax_tuning: bool = True
     enable_norm_bias_tuning: bool = False
diff --git a/docs/step_by_step.md b/docs/step_by_step.md
index 8163ac394..260980895 100644
--- a/docs/step_by_step.md
+++ b/docs/step_by_step.md
@@ -393,7 +393,7 @@ We will try to optimize the RAM usage in the future. The RAM usage is about 1.1-
 | Qwen3-32B | W2A16/W4A16/W8A16 | OOM with 240G | --- | OOM with 240G | --- |
 | Qwen3-32B | MXFP4/MXFP8 | 160G | 200s * len of options | 200G | 240s * len of options |
 | Qwen3-32B | GGUF* | 210G | 80s * len of options | 200G | 60s * len of options |
-
+
 
 #### Limitations
 
diff --git a/requirements.txt b/requirements.txt
index 4f348bfdc..b84214f51 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,7 +2,6 @@
 accelerate
 datasets
 numpy
-# packaging # for python version <= 3.9
 py-cpuinfo
 threadpoolctl
 torch
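For context on the patch above: `disable_opt_rtn` becomes tri-state, where `None` means "let AutoRound decide" and only INT8 weight-only RTN runs (`bits >= 8`, `act_bits >= 16`, `iters == 0`, `data_type == "int"`) fall back to pure RTN. The sketch below mirrors that default-resolution logic; the standalone `resolve_disable_opt_rtn` helper and the sample values are illustrative only and are not part of the patch.

```python
from typing import Optional


def resolve_disable_opt_rtn(
    disable_opt_rtn: Optional[bool],
    bits: int,
    act_bits: int,
    iters: int,
    data_type: str,
) -> bool:
    """Illustrative mirror of the default handling added to BaseCompressor.__init__."""
    if disable_opt_rtn is not None:
        # An explicit user choice (True/False) always wins.
        return disable_opt_rtn
    # INT8 weight-only RTN: fall back to pure RTN by default.
    if bits >= 8 and act_bits >= 16 and iters == 0 and data_type == "int":
        return True
    # Everywhere else the improved RTN path stays enabled.
    return False


# The three cases: auto-enabled for INT8 RTN, auto-disabled otherwise, explicit choice kept.
assert resolve_disable_opt_rtn(None, bits=8, act_bits=16, iters=0, data_type="int") is True
assert resolve_disable_opt_rtn(None, bits=4, act_bits=16, iters=0, data_type="int") is False
assert resolve_disable_opt_rtn(False, bits=8, act_bits=16, iters=0, data_type="int") is False
```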
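On the CLI side, switching to `argparse.BooleanOptionalAction` (Python 3.9+) makes the flag tri-state as well and auto-generates `--no-…` negations. A minimal standalone sketch with the same option strings as the patched tuning group; the bare parser here is hypothetical, since in `auto_round/__main__.py` the argument lives in a larger parser:

```python
import argparse

parser = argparse.ArgumentParser()
# Same option strings, action, and default as the patched add_argument call.
parser.add_argument(
    "--disable_opt_rtn",
    "--disable-opt-rtn",
    action=argparse.BooleanOptionalAction,
    default=None,
    help="Disable optimization for RTN (Round-To-Nearest) mode when iters=0.",
)

print(parser.parse_args([]).disable_opt_rtn)                         # None -> resolved automatically later
print(parser.parse_args(["--disable_opt_rtn"]).disable_opt_rtn)      # True
print(parser.parse_args(["--no-disable-opt-rtn"]).disable_opt_rtn)   # False (negation generated by BooleanOptionalAction)
```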