Merged · Changes from 1 commit
2 changes: 1 addition & 1 deletion README.md
```diff
@@ -195,7 +195,7 @@ ar.quantize_and_save(output_dir="./qmodel", format="auto_round")
 
 ##### Algorithm Settings
 - **`enable_alg_ext` (bool)**: [Experimental Feature] Only for `iters>0`. Enable algorithm variants for specific schemes (e.g., MXFP4/W2A16) that could bring notable improvements. Default is `False`.
-- **`disable_opt_rtn` (bool)**: Use pure RTN mode for specific schemes (e.g., GGUF and WOQ). Default is `False` (improved RTN enabled).
+- **`disable_opt_rtn` (bool)**: Use pure RTN mode for specific schemes (e.g., GGUF and WOQ). Default is `None` (improved RTN enabled).
 
 ##### Tuning Process Parameters
 - **`iters` (int)**: Number of tuning iterations (default is `200`). Common values: 0 (RTN mode), 50 (with lr=5e-3 recommended), 1000. Higher values increase accuracy but slow down tuning.
```
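A hedged sketch of how the newly documented `None` default is used from the Python API. The `AutoRound` entry point and the `quantize_and_save` call appear in the README context above; the model id and the exact argument spelling here are illustrative assumptions, not part of this PR:

```python
from auto_round import AutoRound

ar = AutoRound(
    "Qwen/Qwen2.5-0.5B-Instruct",  # hypothetical model id, for illustration only
    iters=0,                # RTN mode: no tuning iterations
    disable_opt_rtn=None,   # unset: the library may pick a scheme-aware default
)
ar.quantize_and_save(output_dir="./qmodel", format="auto_round")
```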
4 changes: 3 additions & 1 deletion auto_round/__main__.py
```diff
@@ -255,7 +255,9 @@ def __init__(self, *args, **kwargs):
         )
         tuning.add_argument(
             "--disable_opt_rtn",
-            action="store_true",
+            "--disable-opt-rtn",
+            action=argparse.BooleanOptionalAction,
+            default=None,
             help="Disable optimization for RTN (Round-To-Nearest) mode when iters=0. "
             "RTN is fast but less accurate; keeping optimization enabled is recommended.",
         )
```
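Switching from `action="store_true"` to `argparse.BooleanOptionalAction` (Python 3.9+) with `default=None` turns the flag into a tri-state: unset, explicitly on, or explicitly off. A minimal, self-contained sketch of that behavior (not part of the PR):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--disable_opt_rtn",
    "--disable-opt-rtn",
    action=argparse.BooleanOptionalAction,  # also generates --no-disable_opt_rtn
    default=None,
)

print(parser.parse_args([]).disable_opt_rtn)                        # None: let the library decide
print(parser.parse_args(["--disable_opt_rtn"]).disable_opt_rtn)     # True: force pure RTN
print(parser.parse_args(["--no-disable_opt_rtn"]).disable_opt_rtn)  # False: keep improved RTN
```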
8 changes: 4 additions & 4 deletions auto_round/autoround.py
```diff
@@ -13,7 +13,7 @@
 # limitations under the License.
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Union
+from typing import TYPE_CHECKING, Optional, Union
 
 import torch
 
@@ -87,7 +87,7 @@ def __new__(
         # for MLLM and Diffusion
         extra_config: ExtraConfig = None,
         enable_alg_ext: bool = None,
-        disable_opt_rtn: bool = None,
+        disable_opt_rtn: Optional[bool] = None,
         low_cpu_mem_usage: bool = False,
         **kwargs,
     ) -> BaseCompressor:
@@ -123,7 +123,7 @@
             act_dynamic (bool, optional): Dynamic activation quantization. Defaults to True.
             enable_torch_compile (bool, optional): Enable torch.compile for quant blocks/layers. Defaults to False.
             device_map (str | dict, optional): Device placement map. Defaults to None.
-            disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to False.
+            disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to None.
             enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2). Defaults to False.
             model_dtype (str): model dtype used to load pre-trained model.
             **kwargs: Backward compatible options:
@@ -283,7 +283,7 @@ class AutoRoundLLM(LLMCompressor):
             act_dynamic (bool, optional): Dynamic activation quantization. Defaults to True.
             enable_torch_compile (bool, optional): Enable torch.compile for quant blocks/layers. Defaults to False.
             device_map (str | dict, optional): Device placement map. Defaults to None.
-            disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to False.
+            disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to None.
             enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2). Defaults to False.
             **kwargs: Backward compatible options:
                 - enable_alg_ext, quant_lm_head, lr, lr_scheduler, sampler, not_use_best_mse, dynamic_max_gap,
```
7 changes: 5 additions & 2 deletions auto_round/compressors/base.py
```diff
@@ -189,7 +189,7 @@ def __init__(
         device_map: Union[str, torch.device, int, dict] = 0,
         enable_torch_compile: bool = False,
         enable_alg_ext: bool = False,
-        disable_opt_rtn: bool = False,
+        disable_opt_rtn: Optional[bool] = None,
         seed: int = 42,
         low_cpu_mem_usage: bool = False,
         **kwargs,
@@ -226,7 +226,7 @@
             act_dynamic (bool, optional): Dynamic activation quantization. Defaults to True.
             enable_torch_compile (bool, optional): Enable torch.compile for quant blocks/layers. Defaults to False.
             device_map (str | dict, optional): Device placement map. Defaults to None.
-            disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to False.
+            disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to None.
             enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2). Defaults to False.
             **kwargs: Backward compatible options:
                 - enable_alg_ext, quant_lm_head, lr, lr_scheduler, not_use_best_mse, dynamic_max_gap,
@@ -388,6 +388,9 @@ def __init__(
             logger.warning(
                 "for bits <= 2, it is recommended to enable `auto-round-best` " "and turn on `--enable_alg_ext` "
             )
+        if self.bits == 8 and self.iters == 0 and self.data_type == "int" and disable_opt_rtn is None:
+            logger.warning("for INT8 RTN quantization, set `--disable_opt_rtn` as default.")
+            disable_opt_rtn = True
 
         self.minmax_lr = minmax_lr or self.lr
         self.enable_alg_ext = enable_alg_ext
```
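The new branch only fires when the flag was left unset, which is exactly the state `default=None` preserves from the CLI. A standalone sketch of the resolution order (hypothetical function name; the condition mirrors the diff):

```python
from typing import Optional

def resolve_disable_opt_rtn(user_value: Optional[bool], bits: int, iters: int, data_type: str) -> bool:
    """An explicit user choice always wins; None falls back to pure RTN
    only for INT8 RTN quantization, as in the diff above."""
    if user_value is None:
        return bits == 8 and iters == 0 and data_type == "int"
    return user_value

assert resolve_disable_opt_rtn(None, bits=8, iters=0, data_type="int") is True
assert resolve_disable_opt_rtn(False, bits=8, iters=0, data_type="int") is False
assert resolve_disable_opt_rtn(None, bits=4, iters=0, data_type="int") is False
```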
8 changes: 4 additions & 4 deletions auto_round/compressors/config.py
```diff
@@ -14,7 +14,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass, fields
-from typing import Any, Callable, Union
+from typing import Any, Callable, Optional, Union
 
 import torch
 
@@ -32,7 +32,7 @@ def __init__(
         self,
         # tuning
         amp: bool = True,
-        disable_opt_rtn: bool = True,
+        disable_opt_rtn: Optional[bool] = True,
         enable_alg_ext: bool = False,
         enable_minmax_tuning: bool = True,
         enable_norm_bias_tuning: bool = False,
@@ -74,7 +74,7 @@ def __init__(
 
         Args:
             amp (bool): Whether to use automatic mixed precision (default is True).
-            disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to False.
+            disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to True.
             enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2). Defaults to False.
             enable_minmax_tuning (bool, optional): Enable weight min-max tuning. Defaults to True.
             enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning.
@@ -247,7 +247,7 @@ def is_default(self):
 @dataclass
 class TuningExtraConfig(BaseExtraConfig):
     amp: bool = True
-    disable_opt_rtn: bool = True
+    disable_opt_rtn: Optional[bool] = True
     enable_alg_ext: bool = False
     enable_minmax_tuning: bool = True
     enable_norm_bias_tuning: bool = False
```
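Note the asymmetry: `ExtraConfig`/`TuningExtraConfig` keep `True` as their default, while the field type widens to `Optional[bool]` so `None` becomes representable there as well. A hedged construction sketch (import path taken from this diff; usage style assumed):

```python
from auto_round.compressors.config import TuningExtraConfig

cfg_default = TuningExtraConfig()                       # disable_opt_rtn stays True here
cfg_deferred = TuningExtraConfig(disable_opt_rtn=None)  # defer to scheme-aware resolution
cfg_tuned = TuningExtraConfig(disable_opt_rtn=False)    # keep improved RTN
```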
2 changes: 1 addition & 1 deletion docs/step_by_step.md
```diff
@@ -393,7 +393,7 @@ We will try to optimize the RAM usage in the future. The RAM usage is about 1.1-
 | Qwen3-32B | W2A16/W4A16/W8A16 | OOM with 240G | --- | OOM with 240G | --- |
 | Qwen3-32B | MXFP4/MXFP8 | 160G | 200s * len of options | 200G | 240s * len of options |
 | Qwen3-32B | GGUF* | 210G | 80s * len of options | 200G | 60s * len of options |
-</details>
+</details>
 
 
 #### Limitations
```
8 changes: 1 addition & 7 deletions requirements.txt
```diff
@@ -1,10 +1,4 @@
-# 1.5.1<accelerate<1.10.0 may cause potentially high RAM usage; versions 1.10.0 or above are recommended.
 accelerate
-datasets
-numpy
-# packaging # for python version <= 3.9
-py-cpuinfo
-threadpoolctl
 torch
-tqdm
 transformers>=4.38
+numpy
```