From 52e5bf68bf02e5329dc0792e93b6f94d96e1eeaa Mon Sep 17 00:00:00 2001
From: Weiwei
Date: Thu, 8 Jan 2026 17:18:45 +0800
Subject: [PATCH] =?UTF-8?q?Revert=20"set=20disable=5Fopt=5Frtn=20to=20opti?=
 =?UTF-8?q?onal=20bool=20and=20change=20default=20value=20=20to=20Non?=
 =?UTF-8?q?=E2=80=A6"?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This reverts commit bef28c9a116f4ea871dc0d7411785c6084eda0cb.
---
 README.md                        |  2 +-
 auto_round/__main__.py           |  4 +---
 auto_round/autoround.py          |  8 ++++----
 auto_round/compressors/base.py   | 17 ++---------------
 auto_round/compressors/config.py |  8 ++++----
 docs/step_by_step.md             |  2 +-
 requirements.txt                 |  1 +
 7 files changed, 14 insertions(+), 28 deletions(-)

diff --git a/README.md b/README.md
index 335a8f33f..f24f28ddf 100644
--- a/README.md
+++ b/README.md
@@ -195,7 +195,7 @@ ar.quantize_and_save(output_dir="./qmodel", format="auto_round")
 
 ##### Algorithm Settings
 - **`enable_alg_ext` (bool)**: [Experimental Feature] Only for `iters>0`. Enable algorithm variants for specific schemes (e.g., MXFP4/W2A16) that could bring notable improvements. Default is `False`.
-- **`disable_opt_rtn` (bool)**: Use pure RTN mode for specific schemes (e.g., GGUF and WOQ). Default is `None` (improved RTN enabled).
+- **`disable_opt_rtn` (bool)**: Use pure RTN mode for specific schemes (e.g., GGUF and WOQ). Default is `False` (improved RTN enabled).
 
 ##### Tuning Process Parameters
 - **`iters` (int)**: Number of tuning iterations (default is `200`). Common values: 0 (RTN mode), 50 (with lr=5e-3 recommended), 1000. Higher values increase accuracy but slow down tuning.
diff --git a/auto_round/__main__.py b/auto_round/__main__.py
index 6d95b4864..adbd8fe02 100644
--- a/auto_round/__main__.py
+++ b/auto_round/__main__.py
@@ -255,9 +255,7 @@ def __init__(self, *args, **kwargs):
         )
         tuning.add_argument(
             "--disable_opt_rtn",
-            "--disable-opt-rtn",
-            action=argparse.BooleanOptionalAction,
-            default=None,
+            action="store_true",
             help="Disable optimization for RTN (Round-To-Nearest) mode when iters=0. "
             "RTN is fast but less accurate; keeping optimization enabled is recommended.",
         )
diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index 51c850c16..050281983 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING, Union
 
 import torch
 
@@ -85,7 +85,7 @@ def __new__(
         enable_adam: bool = False,
         extra_config: ExtraConfig = None,
         enable_alg_ext: bool = None,
-        disable_opt_rtn: Optional[bool] = None,
+        disable_opt_rtn: bool = None,
         low_cpu_mem_usage: bool = False,
         **kwargs,
     ) -> BaseCompressor:
@@ -112,7 +112,7 @@ def __new__(
            enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2) for better
                accuracy. Defaults to False.
            disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0) for fast quantization
-               with lower accuracy. Defaults to None.
+               with lower accuracy. Defaults to False.
            low_cpu_mem_usage (bool, optional): Lower CPU memory mode. Defaults to False.
 
            bits (int, optional): Weight quantization bits. Defaults to 4.
@@ -291,7 +291,7 @@ class AutoRoundLLM(LLMCompressor):
         act_dynamic (bool, optional): Dynamic activation quantization. Defaults to True.
         enable_torch_compile (bool, optional): Enable torch.compile for quant blocks/layers. Defaults to False.
         device_map (str | dict, optional): Device placement map. Defaults to None.
-        disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to None.
+        disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to False.
         enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2). Defaults to False.
         **kwargs: Backward compatible options:
             - enable_alg_ext, quant_lm_head, lr, lr_scheduler, sampler, not_use_best_mse, dynamic_max_gap,
diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index cd9fc516c..2665d74ef 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -189,7 +189,7 @@ def __init__(
         device_map: Union[str, torch.device, int, dict] = 0,
         enable_torch_compile: bool = False,
         enable_alg_ext: bool = False,
-        disable_opt_rtn: Optional[bool] = None,
+        disable_opt_rtn: bool = False,
         seed: int = 42,
         low_cpu_mem_usage: bool = False,
         **kwargs,
@@ -226,7 +226,7 @@ def __init__(
            act_dynamic (bool, optional): Dynamic activation quantization. Defaults to True.
            enable_torch_compile (bool, optional): Enable torch.compile for quant blocks/layers. Defaults to False.
            device_map (str | dict, optional): Device placement map. Defaults to None.
-           disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to None.
+           disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to False.
            enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2). Defaults to False.
            **kwargs: Backward compatible options:
                - enable_alg_ext, quant_lm_head, lr, lr_scheduler, not_use_best_mse, dynamic_max_gap,
@@ -389,19 +389,6 @@ def __init__(
                 "for bits <= 2, it is recommended to enable `auto-round-best` "
                 "and turn on `--enable_alg_ext` "
             )
-        # Automatically adjust the disable_opt_rtn option if the user does not explicitly set it.
-        if (
-            self.bits >= 8
-            and self.act_bits >= 16
-            and self.iters == 0
-            and self.data_type == "int"
-            and disable_opt_rtn is None
-        ):
-            logger.warning("for INT8 RTN quantization, set `--disable_opt_rtn` as default.")
-            disable_opt_rtn = True
-        if disable_opt_rtn is None:
-            disable_otp_rtn = False
-
         self.minmax_lr = minmax_lr or self.lr
         self.enable_alg_ext = enable_alg_ext
         self.not_use_best_mse = not_use_best_mse
diff --git a/auto_round/compressors/config.py b/auto_round/compressors/config.py
index 4bab246c3..7c9398a91 100644
--- a/auto_round/compressors/config.py
+++ b/auto_round/compressors/config.py
@@ -14,7 +14,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass, fields
-from typing import Any, Callable, Optional, Union
+from typing import Any, Callable, Union
 
 import torch
 
@@ -32,7 +32,7 @@ def __init__(
         self,
         # tuning
         amp: bool = True,
-        disable_opt_rtn: Optional[bool] = True,
+        disable_opt_rtn: bool = True,
         enable_alg_ext: bool = False,
         enable_minmax_tuning: bool = True,
         enable_norm_bias_tuning: bool = False,
@@ -74,7 +74,7 @@ def __init__(
 
         Args:
            amp (bool): Whether to use automatic mixed precision (default is True).
-           disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to True.
+           disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to False.
            enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2). Defaults to False.
            enable_minmax_tuning (bool, optional): Enable weight min-max tuning. Defaults to True.
            enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning.
@@ -247,7 +247,7 @@ def is_default(self):
 @dataclass
 class TuningExtraConfig(BaseExtraConfig):
     amp: bool = True
-    disable_opt_rtn: Optional[bool] = True
+    disable_opt_rtn: bool = True
     enable_alg_ext: bool = False
     enable_minmax_tuning: bool = True
     enable_norm_bias_tuning: bool = False
diff --git a/docs/step_by_step.md b/docs/step_by_step.md
index 260980895..8163ac394 100644
--- a/docs/step_by_step.md
+++ b/docs/step_by_step.md
@@ -393,7 +393,7 @@ We will try to optimize the RAM usage in the future. The RAM usage is about 1.1-
 | Qwen3-32B | W2A16/W4A16/W8A16 | OOM with 240G | --- | OOM with 240G | --- |
 | Qwen3-32B | MXFP4/MXFP8 | 160G | 200s * len of options | 200G | 240s * len of options |
 | Qwen3-32B | GGUF* | 210G | 80s * len of options | 200G | 60s * len of options |
- 
+
 
 #### Limitations
 
diff --git a/requirements.txt b/requirements.txt
index b84214f51..4f348bfdc 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,6 +2,7 @@
 accelerate
 datasets
 numpy
+# packaging # for python version <= 3.9
 py-cpuinfo
 threadpoolctl
 torch