Merged · Changes from 1 commit
2 changes: 1 addition & 1 deletion README.md
```diff
@@ -195,7 +195,7 @@ ar.quantize_and_save(output_dir="./qmodel", format="auto_round")
 
 ##### Algorithm Settings
 - **`enable_alg_ext` (bool)**: [Experimental Feature] Only for `iters>0`. Enable algorithm variants for specific schemes (e.g., MXFP4/W2A16) that could bring notable improvements. Default is `False`.
-- **`disable_opt_rtn` (bool)**: Use pure RTN mode for specific schemes (e.g., GGUF and WOQ). Default is `False` (improved RTN enabled).
+- **`disable_opt_rtn` (bool)**: Use pure RTN mode for specific schemes (e.g., GGUF and WOQ). Default is `None` (improved RTN enabled).
 
 ##### Tuning Process Parameters
 - **`iters` (int)**: Number of tuning iterations (default is `200`). Common values: 0 (RTN mode), 50 (with lr=5e-3 recommended), 1000. Higher values increase accuracy but slow down tuning.
```
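A hedged sketch of how the newly documented `None` default is used from the Python API. The `AutoRound` entry point and the `quantize_and_save` call appear in the README context above; the model id and the exact argument spelling here are illustrative assumptions, not part of this PR:

```python
from auto_round import AutoRound

ar = AutoRound(
    "Qwen/Qwen2.5-0.5B-Instruct",  # hypothetical model id, for illustration only
    iters=0,                # RTN mode: no tuning iterations
    disable_opt_rtn=None,   # unset: the library may pick a scheme-aware default
)
ar.quantize_and_save(output_dir="./qmodel", format="auto_round")
```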
4 changes: 3 additions & 1 deletion auto_round/__main__.py
```diff
@@ -255,7 +255,9 @@ def __init__(self, *args, **kwargs):
         )
         tuning.add_argument(
             "--disable_opt_rtn",
-            action="store_true",
+            "--disable-opt-rtn",
+            action=argparse.BooleanOptionalAction,
+            default=None,
             help="Disable optimization for RTN (Round-To-Nearest) mode when iters=0. "
             "RTN is fast but less accurate; keeping optimization enabled is recommended.",
         )
```
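Switching from `action="store_true"` to `argparse.BooleanOptionalAction` (Python 3.9+) with `default=None` turns the flag into a tri-state: unset, explicitly on, or explicitly off. A minimal, self-contained sketch of that behavior (not part of the PR):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--disable_opt_rtn",
    "--disable-opt-rtn",
    action=argparse.BooleanOptionalAction,  # also generates --no-disable_opt_rtn
    default=None,
)

print(parser.parse_args([]).disable_opt_rtn)                        # None: let the library decide
print(parser.parse_args(["--disable_opt_rtn"]).disable_opt_rtn)     # True: force pure RTN
print(parser.parse_args(["--no-disable_opt_rtn"]).disable_opt_rtn)  # False: keep improved RTN
```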
8 changes: 4 additions & 4 deletions auto_round/autoround.py
```diff
@@ -13,7 +13,7 @@
 # limitations under the License.
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Union
+from typing import TYPE_CHECKING, Optional, Union
 
 import torch
 
@@ -87,7 +87,7 @@ def __new__(
         # for MLLM and Diffusion
         extra_config: ExtraConfig = None,
         enable_alg_ext: bool = None,
-        disable_opt_rtn: bool = None,
+        disable_opt_rtn: Optional[bool] = None,
         low_cpu_mem_usage: bool = False,
         **kwargs,
     ) -> BaseCompressor:
@@ -123,7 +123,7 @@
             act_dynamic (bool, optional): Dynamic activation quantization. Defaults to True.
             enable_torch_compile (bool, optional): Enable torch.compile for quant blocks/layers. Defaults to False.
             device_map (str | dict, optional): Device placement map. Defaults to None.
-            disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to False.
+            disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to None.
             enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2). Defaults to False.
             model_dtype (str): model dtype used to load pre-trained model.
             **kwargs: Backward compatible options:
@@ -283,7 +283,7 @@ class AutoRoundLLM(LLMCompressor):
             act_dynamic (bool, optional): Dynamic activation quantization. Defaults to True.
             enable_torch_compile (bool, optional): Enable torch.compile for quant blocks/layers. Defaults to False.
             device_map (str | dict, optional): Device placement map. Defaults to None.
-            disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to False.
+            disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to None.
             enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2). Defaults to False.
             **kwargs: Backward compatible options:
                 - enable_alg_ext, quant_lm_head, lr, lr_scheduler, sampler, not_use_best_mse, dynamic_max_gap,
```
7 changes: 5 additions & 2 deletions auto_round/compressors/base.py
```diff
@@ -189,7 +189,7 @@ def __init__(
         device_map: Union[str, torch.device, int, dict] = 0,
         enable_torch_compile: bool = False,
         enable_alg_ext: bool = False,
-        disable_opt_rtn: bool = False,
+        disable_opt_rtn: Optional[bool] = None,
         seed: int = 42,
         low_cpu_mem_usage: bool = False,
         **kwargs,
@@ -226,7 +226,7 @@
             act_dynamic (bool, optional): Dynamic activation quantization. Defaults to True.
             enable_torch_compile (bool, optional): Enable torch.compile for quant blocks/layers. Defaults to False.
             device_map (str | dict, optional): Device placement map. Defaults to None.
-            disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to False.
+            disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to None.
             enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2). Defaults to False.
             **kwargs: Backward compatible options:
                 - enable_alg_ext, quant_lm_head, lr, lr_scheduler, not_use_best_mse, dynamic_max_gap,
@@ -388,6 +388,9 @@ def __init__(
             logger.warning(
                 "for bits <= 2, it is recommended to enable `auto-round-best` " "and turn on `--enable_alg_ext` "
             )
+        if self.bits == 8 and self.iters == 0 and self.data_type == "int" and disable_opt_rtn is None:
+            logger.warning("for INT8 RTN quantization, set `--disable_opt_rtn` as default.")
+            disable_opt_rtn = True
 
         self.minmax_lr = minmax_lr or self.lr
         self.enable_alg_ext = enable_alg_ext
```
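The new branch only fires when the flag was left unset, which is exactly the state `default=None` preserves from the CLI. A standalone sketch of the resolution order (hypothetical function name; the condition mirrors the diff):

```python
from typing import Optional

def resolve_disable_opt_rtn(user_value: Optional[bool], bits: int, iters: int, data_type: str) -> bool:
    """An explicit user choice always wins; None falls back to pure RTN
    only for INT8 RTN quantization, as in the diff above."""
    if user_value is None:
        return bits == 8 and iters == 0 and data_type == "int"
    return user_value

assert resolve_disable_opt_rtn(None, bits=8, iters=0, data_type="int") is True
assert resolve_disable_opt_rtn(False, bits=8, iters=0, data_type="int") is False
assert resolve_disable_opt_rtn(None, bits=4, iters=0, data_type="int") is False
```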
8 changes: 4 additions & 4 deletions auto_round/compressors/config.py
```diff
@@ -14,7 +14,7 @@
 from __future__ import annotations
 
 from dataclasses import dataclass, fields
-from typing import Any, Callable, Union
+from typing import Any, Callable, Optional, Union
 
 import torch
 
@@ -32,7 +32,7 @@ def __init__(
         self,
         # tuning
         amp: bool = True,
-        disable_opt_rtn: bool = True,
+        disable_opt_rtn: Optional[bool] = True,
         enable_alg_ext: bool = False,
         enable_minmax_tuning: bool = True,
         enable_norm_bias_tuning: bool = False,
@@ -74,7 +74,7 @@ def __init__(
 
         Args:
             amp (bool): Whether to use automatic mixed precision (default is True).
-            disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to False.
+            disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to True.
             enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2). Defaults to False.
             enable_minmax_tuning (bool, optional): Enable weight min-max tuning. Defaults to True.
             enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning.
@@ -247,7 +247,7 @@ def is_default(self):
 @dataclass
 class TuningExtraConfig(BaseExtraConfig):
     amp: bool = True
-    disable_opt_rtn: bool = True
+    disable_opt_rtn: Optional[bool] = True
     enable_alg_ext: bool = False
     enable_minmax_tuning: bool = True
     enable_norm_bias_tuning: bool = False
```
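Note the asymmetry: `ExtraConfig`/`TuningExtraConfig` keep `True` as their default, while the field type widens to `Optional[bool]` so `None` becomes representable there as well. A hedged construction sketch (import path taken from this diff; usage style assumed):

```python
from auto_round.compressors.config import TuningExtraConfig

cfg_default = TuningExtraConfig()                       # disable_opt_rtn stays True here
cfg_deferred = TuningExtraConfig(disable_opt_rtn=None)  # defer to scheme-aware resolution
cfg_tuned = TuningExtraConfig(disable_opt_rtn=False)    # keep improved RTN
```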
2 changes: 1 addition & 1 deletion docs/step_by_step.md
```diff
@@ -393,7 +393,7 @@ We will try to optimize the RAM usage in the future. The RAM usage is about 1.1-
 | Qwen3-32B | W2A16/W4A16/W8A16 | OOM with 240G | --- | OOM with 240G | --- |
 | Qwen3-32B | MXFP4/MXFP8 | 160G | 200s * len of options | 200G | 240s * len of options |
 | Qwen3-32B | GGUF* | 210G | 80s * len of options | 200G | 60s * len of options |
-</details>
+</details>
 
 
 #### Limitations
```
8 changes: 1 addition & 7 deletions requirements.txt
```diff
@@ -1,10 +1,4 @@
-# 1.5.1<accelerate<1.10.0 may cause potentially high RAM usage; versions 1.10.0 or above are recommended.
 accelerate
-datasets
-numpy
-# packaging # for python version <= 3.9
-py-cpuinfo
-threadpoolctl
 torch
-tqdm
 transformers>=4.38
+numpy
```