2 changes: 1 addition & 1 deletion README.md
@@ -195,7 +195,7 @@ ar.quantize_and_save(output_dir="./qmodel", format="auto_round")

##### Algorithm Settings
- **`enable_alg_ext` (bool)**: [Experimental Feature] Only for `iters>0`. Enable algorithm variants for specific schemes (e.g., MXFP4/W2A16) that could bring notable improvements. Default is `False`.
-- **`disable_opt_rtn` (bool)**: Use pure RTN mode for specific schemes (e.g., GGUF and WOQ). Default is `False` (improved RTN enabled).
+- **`disable_opt_rtn` (bool)**: Use pure RTN mode for specific schemes (e.g., GGUF and WOQ). Default is `None` (auto: improved RTN is enabled, except for INT8 RTN, which falls back to pure RTN).

##### Tuning Process Parameters
- **`iters` (int)**: Number of tuning iterations (default is `200`). Common values: 0 (RTN mode), 50 (with lr=5e-3 recommended), 1000. Higher values increase accuracy but slow down tuning.
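With the default now `None`, callers only pass `disable_opt_rtn` to override the auto-decision. A minimal usage sketch (the model name is a placeholder; the `AutoRound` call shape follows the README's earlier example):

```python
from auto_round import AutoRound

# Leave disable_opt_rtn unset (None): the compressor auto-decides. Improved RTN
# is used except for INT8 RTN (bits >= 8, act_bits >= 16, iters == 0, data_type == "int").
ar = AutoRound("facebook/opt-125m", bits=8, iters=0)

# Pass an explicit bool to override, e.g. force improved RTN even for INT8:
ar = AutoRound("facebook/opt-125m", bits=8, iters=0, disable_opt_rtn=False)
ar.quantize_and_save(output_dir="./qmodel", format="auto_round")
```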
4 changes: 3 additions & 1 deletion auto_round/__main__.py
@@ -255,7 +255,9 @@ def __init__(self, *args, **kwargs):
)
tuning.add_argument(
"--disable_opt_rtn",
action="store_true",
"--disable-opt-rtn",
action=argparse.BooleanOptionalAction,
default=None,
help="Disable optimization for RTN (Round-To-Nearest) mode when iters=0. "
"RTN is fast but less accurate; keeping optimization enabled is recommended.",
)
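`argparse.BooleanOptionalAction` (Python >= 3.9) auto-generates matching `--no-` variants, which is what makes the tri-state default workable from the CLI. A standalone sketch of the behavior (not the project's actual parser):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--disable_opt_rtn",
    "--disable-opt-rtn",
    action=argparse.BooleanOptionalAction,
    default=None,
)

print(parser.parse_args([]).disable_opt_rtn)                        # None -> auto-decide
print(parser.parse_args(["--disable_opt_rtn"]).disable_opt_rtn)     # True -> pure RTN
print(parser.parse_args(["--no-disable_opt_rtn"]).disable_opt_rtn)  # False -> improved RTN
```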
8 changes: 4 additions & 4 deletions auto_round/autoround.py
@@ -13,7 +13,7 @@
# limitations under the License.
from __future__ import annotations

-from typing import TYPE_CHECKING, Union
+from typing import TYPE_CHECKING, Optional, Union

import torch

@@ -85,7 +85,7 @@ def __new__(
enable_adam: bool = False,
extra_config: ExtraConfig = None,
enable_alg_ext: bool = None,
-disable_opt_rtn: bool = None,
+disable_opt_rtn: Optional[bool] = None,
low_cpu_mem_usage: bool = False,
**kwargs,
) -> BaseCompressor:
@@ -112,7 +112,7 @@ def __new__(
enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2)
for better accuracy. Defaults to False.
disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0) for fast quantization
-with lower accuracy. Defaults to False.
+with lower accuracy. Defaults to None.
low_cpu_mem_usage (bool, optional): Lower CPU memory mode. Defaults to False.

bits (int, optional): Weight quantization bits. Defaults to 4.
@@ -291,7 +291,7 @@ class AutoRoundLLM(LLMCompressor):
act_dynamic (bool, optional): Dynamic activation quantization. Defaults to True.
enable_torch_compile (bool, optional): Enable torch.compile for quant blocks/layers. Defaults to False.
device_map (str | dict, optional): Device placement map. Defaults to None.
-disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to False.
+disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to None.
enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2). Defaults to False.
**kwargs: Backward compatible options:
- enable_alg_ext, quant_lm_head, lr, lr_scheduler, sampler, not_use_best_mse, dynamic_max_gap,
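The `Optional[bool]` annotation also matters for type checkers: `bool = None` is rejected under strict checking, while `Optional[bool]` makes the tri-state contract explicit. A minimal illustration (hypothetical function, not part of this PR):

```python
from typing import Optional

def quantize(disable_opt_rtn: Optional[bool] = None) -> None:
    """None = auto-decide, True = pure RTN, False = improved RTN."""
    ...
```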
17 changes: 15 additions & 2 deletions auto_round/compressors/base.py
@@ -189,7 +189,7 @@ def __init__(
device_map: Union[str, torch.device, int, dict] = 0,
enable_torch_compile: bool = False,
enable_alg_ext: bool = False,
-disable_opt_rtn: bool = False,
+disable_opt_rtn: Optional[bool] = None,
seed: int = 42,
low_cpu_mem_usage: bool = False,
**kwargs,
@@ -226,7 +226,7 @@ def __init__(
act_dynamic (bool, optional): Dynamic activation quantization. Defaults to True.
enable_torch_compile (bool, optional): Enable torch.compile for quant blocks/layers. Defaults to False.
device_map (str | dict, optional): Device placement map. Defaults to None.
-disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to False.
+disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to None.
enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2). Defaults to False.
**kwargs: Backward compatible options:
- enable_alg_ext, quant_lm_head, lr, lr_scheduler, not_use_best_mse, dynamic_max_gap,
@@ -389,6 +389,19 @@ def __init__(
"for bits <= 2, it is recommended to enable `auto-round-best` " "and turn on `--enable_alg_ext` "
)

+# Automatically adjust the disable_opt_rtn option if the user does not explicitly set it.
+if (
+    self.bits >= 8
+    and self.act_bits >= 16
+    and self.iters == 0
+    and self.data_type == "int"
+    and disable_opt_rtn is None
+):
+    logger.warning("for INT8 RTN quantization, `disable_opt_rtn` is enabled by default.")
+    disable_opt_rtn = True
+if disable_opt_rtn is None:
+    disable_opt_rtn = False
+
self.minmax_lr = minmax_lr or self.lr
self.enable_alg_ext = enable_alg_ext
self.not_use_best_mse = not_use_best_mse
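The new auto-adjust branch can be read as a pure function. A sketch of the resolution logic added above (the helper name is hypothetical):

```python
from typing import Optional

def resolve_disable_opt_rtn(
    disable_opt_rtn: Optional[bool],
    bits: int,
    act_bits: int,
    iters: int,
    data_type: str,
) -> bool:
    """Mirror of the tri-state resolution in the compressor's __init__."""
    if (
        disable_opt_rtn is None
        and bits >= 8
        and act_bits >= 16
        and iters == 0
        and data_type == "int"
    ):
        # INT8 (or wider) weight-only RTN: plain RTN is already accurate enough,
        # so the optimized-RTN pass is skipped by default.
        return True
    return bool(disable_opt_rtn)  # None falls through to False

assert resolve_disable_opt_rtn(None, 8, 16, 0, "int") is True
assert resolve_disable_opt_rtn(None, 4, 16, 0, "int") is False
assert resolve_disable_opt_rtn(False, 8, 16, 0, "int") is False
```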
8 changes: 4 additions & 4 deletions auto_round/compressors/config.py
@@ -14,7 +14,7 @@
from __future__ import annotations

from dataclasses import dataclass, fields
-from typing import Any, Callable, Union
+from typing import Any, Callable, Optional, Union

import torch

@@ -32,7 +32,7 @@ def __init__(
self,
# tuning
amp: bool = True,
-disable_opt_rtn: bool = True,
+disable_opt_rtn: Optional[bool] = True,
enable_alg_ext: bool = False,
enable_minmax_tuning: bool = True,
enable_norm_bias_tuning: bool = False,
@@ -74,7 +74,7 @@
Args:
amp (bool): Whether to use automatic mixed precision (default is True).
-disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to False.
+disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to True.
enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2). Defaults to False.
enable_minmax_tuning (bool, optional): Enable weight min-max tuning. Defaults to True.
enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning.
@@ -247,7 +247,7 @@ def is_default(self):
@dataclass
class TuningExtraConfig(BaseExtraConfig):
amp: bool = True
-disable_opt_rtn: bool = True
+disable_opt_rtn: Optional[bool] = True
enable_alg_ext: bool = False
enable_minmax_tuning: bool = True
enable_norm_bias_tuning: bool = False
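Note the asymmetry this PR keeps: `ExtraConfig` still defaults `disable_opt_rtn` to `True`, while the compressor itself now defaults to `None`. A sketch of restoring auto-decide through the config (import path assumed from this diff's file layout):

```python
from auto_round.compressors.config import ExtraConfig

# disable_opt_rtn accepts the same tri-state here: True (pure RTN),
# False (improved RTN), or None (let the compressor auto-decide).
cfg = ExtraConfig(disable_opt_rtn=None)
```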
2 changes: 1 addition & 1 deletion docs/step_by_step.md
@@ -393,7 +393,7 @@ We will try to optimize the RAM usage in the future. The RAM usage is about 1.1-
| Qwen3-32B | W2A16/W4A16/W8A16 | OOM with 240G | --- | OOM with 240G | --- |
| Qwen3-32B | MXFP4/MXFP8 | 160G | 200s * len of options | 200G | 240s * len of options |
| Qwen3-32B | GGUF* | 210G | 80s * len of options | 200G | 60s * len of options |
-</details>
+</details>


#### Limitations
1 change: 0 additions & 1 deletion requirements.txt
@@ -2,7 +2,6 @@
accelerate
datasets
numpy
-# packaging # for python version <= 3.9
py-cpuinfo
threadpoolctl
torch