diff --git a/README.md b/README.md
index 1e5b5c30e..f24f28ddf 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@
AutoRound is an advanced quantization toolkit designed for Large Language Models (LLMs) and Vision-Language Models (VLMs). It achieves high accuracy at ultra-low bit widths (2–4 bits) with minimal tuning by leveraging **sign-gradient descent** and providing broad hardware compatibility.
-See our [paper](https://arxiv.org/pdf/2309.05516) for more details. For usage instructions, please refer to the [User Guide](./docs/step_by_step.md).
+See our papers [SignRoundV1](https://arxiv.org/pdf/2309.05516) and [SignRoundV2](http://arxiv.org/abs/2512.04746) for more details. For usage instructions, please refer to the [User Guide](./docs/step_by_step.md).
@@ -39,7 +39,7 @@ See our [paper](https://arxiv.org/pdf/2309.05516) for more details. For usage in
* [2025/10] AutoRound has been integrated into **SGLang**: [*Usage*](https://docs.sglang.io/advanced_features/quantization.html#using-auto-round), [*LMSYS Blog*](https://lmsys.org/blog/2025-11-13-AutoRound/), [*X post*](https://x.com/lmsysorg/status/1991977019220148650?s=20), [*Intel blog*](https://community.intel.com/t5/Blogs/Tech-Innovation/Artificial-Intelligence-AI/AutoRound-Meets-SGLang-Enabling-Quantized-Model-Inference-with/post/1727196), [*Linkedin*](https://www.linkedin.com/feed/update/urn:li:activity:7397742859354857472).
-* [2025/10] A **mix precision** algorithm is available to generate schemes in minutes: [*Usage*](https://github.com/intel/auto-round/blob/main/docs/step_by_step.md#autoscheme), [*Accuracy*](./docs/auto_scheme_acc.md).
+* [2025/10] A **mixed precision** algorithm is available to generate schemes in minutes: [*Usage*](https://github.com/intel/auto-round/blob/main/docs/step_by_step.md#autoscheme), [*Accuracy*](./docs/auto_scheme_acc.md).
* [2025/09] **MXFP4** and **NVFP4** dtypes is available: [*Accuracy*](./docs/mxnv_acc.md).
@@ -339,8 +339,15 @@ print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))
## Publications & Events
+[SignRoundV2: Closing the Performance Gap in Extremely Low-Bit Post-Training Quantization for LLMs](https://arxiv.org/abs/2512.04746) (202512 paper)
-[Publication List](./docs/publication_list.md).
+[Optimize Weight Rounding via Signed Gradient Descent for the Quantization of LLM](https://aclanthology.org/2024.findings-emnlp.662/) (202309 paper)
+
+[TEQ: Trainable Equivalent Transformation for Quantization of LLMs](https://arxiv.org/abs/2310.10944) (202310 paper)
+
+[Effective Post-Training Quantization for Large Language Models](https://medium.com/intel-analytics-software/effective-post-training-quantization-for-large-language-models-with-enhanced-smoothquant-approach-93e9d104fb98) (202304 blog)
+
+Check out the full [Publication List](./docs/publication_list.md).
## Acknowledgement
Special thanks to open-source low precision libraries such as AutoGPTQ, AutoAWQ, GPTQModel, Triton, Marlin, and ExLLaMAV2 for providing low-precision CUDA kernels, which are leveraged in AutoRound.
diff --git a/auto_round/__main__.py b/auto_round/__main__.py
index d127cf571..adbd8fe02 100644
--- a/auto_round/__main__.py
+++ b/auto_round/__main__.py
@@ -303,6 +303,7 @@ def __init__(self, *args, **kwargs):
help="Quantize the lm_head. " "Usually kept in higher precision for better output quality.",
)
scheme.add_argument(
+ "--ignore_layers",
"--fp_layers",
default="",
type=str,
@@ -599,7 +600,7 @@ def tune(args):
super_bits=args.super_bits,
super_group_size=args.super_group_size,
quant_lm_head=args.quant_lm_head,
- fp_layers=args.fp_layers,
+ ignore_layers=args.ignore_layers,
static_kv_dtype=args.static_kv_dtype,
static_attention_dtype=args.static_attention_dtype,
)
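Note on the CLI change above: registering both option strings on the same argparse argument is what lets `tune()` read `args.ignore_layers` while the old `--fp_layers` flag keeps working. A minimal, self-contained sketch of the pattern (not the project's actual parser wiring):

```python
import argparse

# When two long option strings are registered on one argument, argparse derives
# `dest` from the first one, so the deprecated `--fp_layers` flag still parses
# into `args.ignore_layers`.
parser = argparse.ArgumentParser()
parser.add_argument(
    "--ignore_layers",
    "--fp_layers",  # backward-compatible alias
    default="",
    type=str,
    help="Comma-separated names of layers to keep unquantized.",
)

args = parser.parse_args(["--fp_layers", "lm_head,self_attn"])
print(args.ignore_layers)  # -> "lm_head,self_attn"
```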
diff --git a/auto_round/autoround.py b/auto_round/autoround.py
index a15919cfb..050281983 100644
--- a/auto_round/autoround.py
+++ b/auto_round/autoround.py
@@ -82,9 +82,7 @@ def __new__(
device_map: Union[str, torch.device, int, dict] = 0,
enable_torch_compile: bool = False,
seed: int = 42,
- # for adam
enable_adam: bool = False,
- # for MLLM and Diffusion
extra_config: ExtraConfig = None,
enable_alg_ext: bool = None,
disable_opt_rtn: bool = None,
@@ -96,36 +94,42 @@ def __new__(
Args:
model (torch.nn.Module | str): Model object or model name to load.
tokenizer: Tokenizer for text processing. Required if `model` is not a string and `iters > 0`.
+ platform: The platform from which to download the pretrained model. Options: ["hf", "model_scope"].
scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations
- bits (int, optional): Weight quantization bits. Defaults to 4.
- group_size (int, optional): Weight quantization group size. Defaults to 128.
- sym (bool, optional): Symmetric weight quantization. Defaults to True.
layer_config (dict, optional): Layer-wise quantization config. Defaults to None.
- batch_size (int, optional): Calibration batch size. Defaults to 8.
- amp (bool, optional): Use AMP for tuning. Defaults to True.
- device (str | torch.device | int, optional): Compute device. Defaults to 0.
dataset (str | list | tuple | DataLoader, optional): Calibration data. Defaults to "NeelNanda/pile-10k".
- enable_minmax_tuning (bool, optional): Enable weight min-max tuning. Defaults to True.
- lr (float, optional): Learning rate; if None, set to 1.0 / iters except when iters==0.
- minmax_lr (float, optional): Learning rate for min-max tuning; defaults to `lr`.
- low_gpu_mem_usage (bool, optional): Lower GPU memory mode. Defaults to False.
- low_cpu_mem_usage (bool, optional): Lower CPU memory mode. Defaults to False.
iters (int, optional): Optimization iterations. Defaults to 200.
seqlen (int, optional): Calibration sequence length. Defaults to 2048.
nsamples (int, optional): Number of calibration samples. Defaults to 128.
- seed (int, optional): Random seed. Defaults to 42.
+ batch_size (int, optional): Calibration batch size. Defaults to 8.
gradient_accumulate_steps (int, optional): Gradient accumulation steps. Defaults to 1.
+ low_gpu_mem_usage (bool, optional): Lower GPU memory mode. Defaults to False.
+ device_map (str | dict, optional): Device map for each module. Defaults to 0.
+ enable_torch_compile (bool, optional): Enable torch.compile to reduce quantization cost. Defaults to False.
+ seed (int, optional): Random seed. Defaults to 42.
+ enable_adam (bool, optional): Enable Adam-based optimizer. Defaults to False.
+ extra_config (ExtraConfig, optional): Bundles additional configuration options (e.g., scheme extras and MLLM settings). Defaults to None.
+ enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2)
+ for better accuracy. Defaults to False.
+ disable_opt_rtn (bool, optional): Disable the optimized RTN mode (iters=0) for faster quantization
+ at the cost of some accuracy. Defaults to False.
+ low_cpu_mem_usage (bool, optional): Lower CPU memory mode. Defaults to False.
+
+ bits (int, optional): Weight quantization bits. Defaults to 4.
+ group_size (int, optional): Weight quantization group size. Defaults to 128.
+ sym (bool, optional): Symmetric weight quantization. Defaults to True.
data_type (str, optional): Weight data type string, e.g., "int". Defaults to "int".
act_bits (int, optional): Activation quantization bits. Defaults to 16.
act_group_size (int, optional): Activation group size. Defaults to None.
act_sym (bool, optional): Symmetric activation quantization. Defaults to None.
act_data_type (str, optional): Activation data type; inherits weight dtype if None and act_bits < 16.
act_dynamic (bool, optional): Dynamic activation quantization. Defaults to True.
- enable_torch_compile (bool, optional): Enable torch.compile for quant blocks/layers. Defaults to False.
- device_map (str | dict, optional): Device placement map. Defaults to None.
- disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to False.
- enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2). Defaults to False.
model_dtype (str): model dtype used to load pre-trained model.
+ amp (bool, optional): Use AMP for tuning. Defaults to True.
+ enable_minmax_tuning (bool, optional): Enable weight min-max tuning. Defaults to True.
+ lr (float, optional): Learning rate; if None, set to 1.0 / iters except when iters==0.
+ minmax_lr (float, optional): Learning rate for min-max tuning; defaults to `lr`.
+
**kwargs: Backward compatible options:
- enable_alg_ext, quant_lm_head, lr, lr_scheduler, sampler, not_use_best_mse, dynamic_max_gap,
super_group_size, super_bits, scale_dtype ("fp16" etc.),
@@ -140,18 +144,17 @@ def __new__(
>>> layer_config = {
... "layer1": {
- ... "data_type": "int",
- ... "bits": 4,
+ ... "bits": 3,
... "group_size": 128,
... "sym": True,
- ... "act_data_type": None,
- ... "act_bits": 16,
- ... "act_group_size": None,
- ... "act_sym": None,
... },
+ ... "layer2": {
+ ... "W8A16"
+ ... }
... # ...
... }
"""
+
model_cls = []
if (extra_config and not extra_config.mllm_config.is_default()) or is_mllm_model(model, platform=platform):
@@ -179,6 +182,11 @@ def __new__(
kwargs["enable_alg_ext"] = enable_alg_ext
if disable_opt_rtn is not None:
kwargs["disable_opt_rtn"] = disable_opt_rtn
+ if "fp_layers" in kwargs:
+ logger.warning_once(
+ "'fp_layers' is deprecated; please use 'ignore_layers' to specify layers that should not be quantized."
+ )
+ kwargs["ignore_layers"] = kwargs.pop("fp_layers")
ar = dynamic_compressor(
model=model,
tokenizer=tokenizer,
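For reference, a minimal usage sketch of the renamed API surface; the model name, scheme, and output directory below are illustrative rather than taken from this diff. `ignore_layers` is the new keyword, and passing `fp_layers` instead should now only trigger the deprecation warning added above:

```python
from auto_round import AutoRound

# Illustrative values; any comma-separated substrings of module names work the
# same way they did for the old `fp_layers` argument.
ar = AutoRound(
    "facebook/opt-125m",                # model name or local path (assumed)
    scheme="W4A16",                     # preset scheme name (assumed available)
    iters=0,                            # RTN mode for a quick run
    ignore_layers="lm_head,self_attn",  # layers kept in higher precision
)
ar.quantize_and_save(format="auto_round", output_dir="./saved")
```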
diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index 8464aa4fb..2665d74ef 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -312,7 +312,7 @@ def __init__(
self.platform = platform
self.quant_lm_head = kwargs.pop("quant_lm_head", False)
- self.fp_layers = kwargs.pop("fp_layers", "")
+ self.ignore_layers = kwargs.pop("ignore_layers", "")
self.supported_types = SUPPORTED_LAYER_TYPES
self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES
self.scale_dtype = convert_dtype_str2torch(scale_dtype)
@@ -504,7 +504,7 @@ def _gen_auto_scheme(
self.supported_types,
self.inner_supported_types,
self.quant_block_list,
- self.fp_layers,
+ self.ignore_layers,
self.quant_lm_head,
enable_gguf_official_mixed=False,
is_mllm=self.mllm,
@@ -1398,7 +1398,7 @@ def configure_layer_config(self, enable_gguf_official_mixed: None | bool = True)
self.supported_types,
self.inner_supported_types,
self.quant_block_list,
- self.fp_layers,
+ self.ignore_layers,
self.quant_lm_head,
enable_gguf_official_mixed=enable_gguf_official_mixed,
is_mllm=self.mllm,
diff --git a/auto_round/compressors/config.py b/auto_round/compressors/config.py
index 733ed6282..7c9398a91 100644
--- a/auto_round/compressors/config.py
+++ b/auto_round/compressors/config.py
@@ -58,7 +58,7 @@ def __init__(
super_group_size: int = None,
static_kv_dtype: Union[str, torch.dtype] = None,
quant_lm_head: bool = False,
- fp_layers: str = None,
+ ignore_layers: str = None,
# mllm
processor: Callable = None,
image_processor: Callable = None,
@@ -139,7 +139,7 @@ def __init__(
super_group_size=super_group_size,
static_kv_dtype=static_kv_dtype,
quant_lm_head=quant_lm_head,
- fp_layers=fp_layers,
+ ignore_layers=ignore_layers,
)
self.mllm_config = MLLMExtraConfig(
processor=processor,
@@ -277,7 +277,7 @@ class SchemeExtraConfig(BaseExtraConfig):
static_kv_dtype: Union[str, torch.dtype] = None
static_attention_dtype: Union[str, torch.dtype] = None
quant_lm_head: bool = False
- fp_layers: str = None
+ ignore_layers: str = None
@dataclass
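The same rename applies when the setting is routed through `ExtraConfig`. A brief sketch, with the import path inferred from the file location in this diff and the model name purely illustrative:

```python
from auto_round import AutoRound
from auto_round.compressors.config import ExtraConfig  # path inferred from this diff

# `ignore_layers` replaces the former `fp_layers` field of the scheme config.
extra = ExtraConfig(ignore_layers="lm_head,mlp.gate", quant_lm_head=False)
ar = AutoRound("facebook/opt-125m", iters=0, extra_config=extra)
```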
diff --git a/auto_round/compressors/utils.py b/auto_round/compressors/utils.py
index 266a90198..b5ea27dea 100644
--- a/auto_round/compressors/utils.py
+++ b/auto_round/compressors/utils.py
@@ -211,7 +211,7 @@ def set_layer_config(
supported_types: tuple,
inner_supported_types: tuple,
quant_block_list=None,
- fp_layers: str = "",
+ ignore_layers: str = "",
quant_lm_head: bool = False,
enable_gguf_official_mixed: bool = True,
is_mllm: bool = False,
@@ -261,8 +261,8 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str
scheme_keys = tuple(f.name for f in fields(QuantizationScheme)) + ("scale_dtype",)
layer_config = copy.deepcopy(layer_config) or {}
- # 1. fp_layers -> force 16
- for name in get_fp_layer_names(model, fp_layers):
+ # 1. ignore_layers -> force 16
+ for name in get_fp_layer_names(model, ignore_layers):
layer_config[name] = {
"bits": 16,
"act_bits": 16,
@@ -852,7 +852,7 @@ def _set_config(config, target_config):
return layer_config, gguf_format_config
-def get_fp_layer_names(model: torch.nn.Module, fp_layers: str):
+def get_fp_layer_names(model: torch.nn.Module, ignore_layers: str):
"""Identifies and returns layers in the model to exclude from quantization.
This function processes a comma-separated list of fully precision (FP) layers,
@@ -861,7 +861,7 @@ def get_fp_layer_names(model: torch.nn.Module, fp_layers: str):
Args:
model (torch.nn.Module): The model whose layers will be inspected.
- fp_layers (str): A comma-separated string of layer names to be excluded
+ ignore_layers (str): A comma-separated string of layer names to be excluded
from quantization. Whitespace is ignored in this string.
Returns:
@@ -870,16 +870,16 @@ def get_fp_layer_names(model: torch.nn.Module, fp_layers: str):
"""
from auto_round.utils import SUPPORTED_LAYER_TYPES
- if not fp_layers:
+ if not ignore_layers:
return []
- fp_layers = fp_layers.replace(" ", "").split(",")
+ ignore_layers = ignore_layers.replace(" ", "").split(",")
all_layer_names = []
for n, m in model.named_modules():
if type(m) in SUPPORTED_LAYER_TYPES:
all_layer_names.append(n)
not_to_quantized_layers = []
- for fp_layer in fp_layers:
+ for fp_layer in ignore_layers:
if fp_layer == "":
continue
if fp_layer in all_layer_names:
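To make the renamed argument's behavior concrete, here is a standalone sketch of how a comma-separated `ignore_layers` string can be resolved against a model's module names. Only the exact-match branch is visible above; the fallback for entries such as `"self_attn"` that are not full module names is assumed here to be substring matching, and `torch.nn.Linear` stands in for `SUPPORTED_LAYER_TYPES`:

```python
import torch

def resolve_ignored_layers(model: torch.nn.Module, ignore_layers: str) -> list[str]:
    """Simplified sketch of get_fp_layer_names-style resolution (assumptions noted above)."""
    if not ignore_layers:
        return []
    entries = ignore_layers.replace(" ", "").split(",")
    # Collect candidate layer names; the real code uses SUPPORTED_LAYER_TYPES.
    all_names = [n for n, m in model.named_modules() if isinstance(m, torch.nn.Linear)]
    ignored = []
    for entry in entries:
        if not entry:
            continue
        if entry in all_names:                      # exact module name
            ignored.append(entry)
        else:                                       # assumed: substring match
            ignored.extend(n for n in all_names if entry in n)
    return ignored

# e.g. resolve_ignored_layers(model, "self_attn,lm_head") collects every Linear
# whose qualified name contains "self_attn", plus the lm_head projection.
```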
diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py
index a5176c1db..52b35f9a4 100644
--- a/auto_round/export/export_to_autoround/export.py
+++ b/auto_round/export/export_to_autoround/export.py
@@ -288,7 +288,7 @@ def save_quantized_as_autoround(
neq_keys = check_neq_config(cfg, **{k: quantization_config[k] for k in scheme_keys})
if len(neq_keys) > 0:
extra_config[layer_name] = {}
- for key in scheme_keys:
+ for key in neq_keys:
if cfg.get(key) is not None:
extra_config[layer_name][key] = cfg[key]
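The export.py change above fixes which keys land in the per-layer `extra_config`: only settings that differ from the global quantization config should be recorded, not every scheme key. A simplified illustration, where `check_neq_config` is a stand-in with assumed behavior rather than the real helper:

```python
# Stand-in for the real check_neq_config: return the keys whose per-layer value
# differs from the global quantization config.
def check_neq_config(layer_cfg: dict, **global_cfg) -> list[str]:
    return [k for k, v in global_cfg.items()
            if layer_cfg.get(k) is not None and layer_cfg[k] != v]

global_cfg = {"bits": 4, "group_size": 128, "sym": True}
layer_cfg = {"bits": 8, "group_size": 128, "sym": True}

extra_config = {}
neq_keys = check_neq_config(layer_cfg, **global_cfg)
if neq_keys:
    # Record only the differing keys, mirroring the `for key in neq_keys` fix.
    extra_config["model.layers.0.mlp.down_proj"] = {
        k: layer_cfg[k] for k in neq_keys if layer_cfg.get(k) is not None
    }
# -> {"model.layers.0.mlp.down_proj": {"bits": 8}}  (group_size/sym omitted, they match)
```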
diff --git a/docs/tips_and_tricks.md b/docs/tips_and_tricks.md
index 8d490b773..3f9177fc2 100644
--- a/docs/tips_and_tricks.md
+++ b/docs/tips_and_tricks.md
@@ -135,7 +135,7 @@ cause the outputs to become excessively large. Even when a fake QDQ model is fin
with the INT4 kernel.
**Suggestion: It is recommended not to quantize layers with large output values**. In AutoRound (>=0.4), you can
-use `--fp_layers "xxx,xxx"` to exclude these layers.
+use `--ignore_layers "xxx,xxx"` to exclude these layers.
**Reasoning**: While adjusting the quantization configuration (symmetric/asymmetric) or using `clamp_to_range` may
provide some benefit, configuration tuning can be tedious, and `clamp_to_range` is not always effective. Therefore, it
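Since the tip above is about skipping layers whose outputs are too large to quantize well, here is a rough sketch of how such layers might be located before building the `--ignore_layers` value. The hook logic, the single calibration batch, and the threshold are all assumptions for illustration:

```python
import torch

def find_large_output_layers(model, sample_inputs, threshold=1e3):
    """Return names of Linear layers whose peak |output| exceeds `threshold`."""
    peaks, hooks = {}, []

    def make_hook(name):
        def hook(_module, _inputs, output):
            if isinstance(output, torch.Tensor):
                peaks[name] = max(peaks.get(name, 0.0), output.abs().max().item())
        return hook

    for name, module in model.named_modules():
        if isinstance(module, torch.nn.Linear):
            hooks.append(module.register_forward_hook(make_hook(name)))
    with torch.no_grad():
        model(**sample_inputs)  # one illustrative calibration batch
    for h in hooks:
        h.remove()
    return [n for n, p in peaks.items() if p > threshold]

# ignore = ",".join(find_large_output_layers(model, inputs))
# then e.g.: AutoRound(model, tokenizer, ignore_layers=ignore, ...)
```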
diff --git a/test/test_cpu/test_act_quantization.py b/test/test_cpu/test_act_quantization.py
index 6e9152ab2..47cda3599 100644
--- a/test/test_cpu/test_act_quantization.py
+++ b/test/test_cpu/test_act_quantization.py
@@ -173,15 +173,7 @@ def test_WOQ_config_INT_saving(self, tiny_opt_model_path, dataloader):
# check inblock layer config values
kproj_config = extra_config["model.decoder.layers.1.self_attn.k_proj"]
- assert "act_data_type" in kproj_config.keys() and kproj_config["act_data_type"] == "float"
- assert "act_bits" in kproj_config.keys() and kproj_config["act_bits"] == 16
- assert "act_group_size" in kproj_config.keys() and kproj_config["act_group_size"] == 128
- assert "act_sym" in kproj_config.keys() and not kproj_config["act_sym"]
- assert "data_type" in kproj_config.keys() and kproj_config["data_type"] == "int"
assert "bits" in kproj_config.keys() and kproj_config["bits"] == 8
- assert "group_size" in kproj_config.keys() and kproj_config["group_size"] == 128
- assert "sym" in kproj_config.keys() and not kproj_config["sym"]
- assert "act_dynamic" in kproj_config.keys() and kproj_config["act_dynamic"]
shutil.rmtree(quantized_model_path, ignore_errors=True)
def test_act_config_FP8_saving(self, tiny_opt_model_path, dataloader):
@@ -214,12 +206,6 @@ def test_act_config_FP8_saving(self, tiny_opt_model_path, dataloader):
# check inblock layer config values
kproj_config = extra_config["model.decoder.layers.0.self_attn.k_proj"]
- assert "act_data_type" in kproj_config.keys() and kproj_config["act_data_type"] == "fp"
assert "act_bits" in kproj_config.keys() and kproj_config["act_bits"] == 16
- assert "act_group_size" in kproj_config.keys() and kproj_config["act_group_size"] == 0
- assert "act_sym" in kproj_config.keys() and kproj_config["act_sym"]
- assert "data_type" in kproj_config.keys() and kproj_config["data_type"] == "fp"
- assert "bits" in kproj_config.keys() and kproj_config["bits"] == 8
assert "group_size" in kproj_config.keys() and kproj_config["group_size"] == 0
- assert "sym" in kproj_config.keys() and kproj_config["sym"]
shutil.rmtree(quantized_model_path, ignore_errors=True)
diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py
index 92e9d620e..4b89a3a66 100644
--- a/test/test_cpu/test_gguf_format.py
+++ b/test/test_cpu/test_gguf_format.py
@@ -230,7 +230,7 @@ def test_qtype_setting(self):
ar.supported_types,
ar.inner_supported_types,
ar.quant_block_list,
- ar.fp_layers,
+ ar.ignore_layers,
ar.quant_lm_head,
enable_gguf_official_mixed=True,
is_mllm=ar.mllm,
@@ -249,7 +249,7 @@ def test_qtype_setting(self):
ar.supported_types,
ar.inner_supported_types,
ar.quant_block_list,
- ar.fp_layers,
+ ar.ignore_layers,
ar.quant_lm_head,
enable_gguf_official_mixed=True,
is_mllm=ar.mllm,
@@ -271,7 +271,7 @@ def test_qtype_setting(self):
ar.supported_types,
ar.inner_supported_types,
ar.quant_block_list,
- ar.fp_layers,
+ ar.ignore_layers,
ar.quant_lm_head,
enable_gguf_official_mixed=True,
is_mllm=ar.mllm,
diff --git a/test/test_cpu/test_moe_alignment.py b/test/test_cpu/test_moe_alignment.py
index b3dfe5a61..3e689a7e0 100644
--- a/test/test_cpu/test_moe_alignment.py
+++ b/test/test_cpu/test_moe_alignment.py
@@ -41,7 +41,7 @@ def test_moe_scale_alignment_fp8_static(setup_deepseek_v2_lite):
nsamples=4,
iters=0, # RTN for faster testing
seqlen=32,
- fp_layers="self_attn,lm_head",
+ ignore_layers="self_attn,lm_head",
)
quantized_model, save_folder = autoround.quantize_and_save(format="auto_round", output_dir=output_dir)
diff --git a/test/test_cpu/test_moe_model.py b/test/test_cpu/test_moe_model.py
index d60d975fb..397e74820 100644
--- a/test/test_cpu/test_moe_model.py
+++ b/test/test_cpu/test_moe_model.py
@@ -65,7 +65,7 @@ def quantize_model(model, tokenizer, output_dir, scheme, iters=0):
nsamples=2,
iters=iters,
seqlen=32,
- fp_layers="self_attn,router,lm_head,mlp.gate",
+ ignore_layers="self_attn,router,lm_head,mlp.gate",
)
quantized_model, save_folder = autoround.quantize_and_save(format="auto_round", output_dir=output_dir)
return quantized_model
@@ -154,7 +154,7 @@ def test_qwen3_vl_moe_mxfp(setup_qwen3_vl_moe):
nsamples=2,
seqlen=32,
iters=1,
- fp_layers="self_attn,lm_head,mlp.gate",
+ ignore_layers="self_attn,lm_head,mlp.gate",
)
quantized_model, _ = autoround.quantize_and_save(format="auto_round", output_dir=output_dir)
assert quantized_model is not None, "Quantized model should not be None."
diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py
index 7e0600f05..8baaf110e 100644
--- a/test/test_cpu/test_mxfp_nvfp.py
+++ b/test/test_cpu/test_mxfp_nvfp.py
@@ -147,7 +147,7 @@ def test_mxfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader):
), "Illegal MXFP4 packing name or data_type or shape"
assert not hasattr(skip_layer, "weight_scale") and not hasattr( ## check skipped layers
skip_layer, "weight_packed"
- ), "Illegal MXFP4 quantization for fp_layers"
+ ), "Illegal MXFP4 quantization for ignore_layers"
quantization_config = AutoConfig.from_pretrained(
quantized_model_path, trust_remote_code=True
).quantization_config
@@ -187,7 +187,7 @@ def test_rtn_mxfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader):
), "Illegal MXFP4 packing name or data_type or shape"
assert not hasattr(skip_layer, "weight_scale") and not hasattr( ## check skipped layers
skip_layer, "weight_packed"
- ), "Illegal MXFP4 quantization for fp_layers"
+ ), "Illegal MXFP4 quantization for ignore_layers"
quantization_config = AutoConfig.from_pretrained(
quantized_model_path, trust_remote_code=True
).quantization_config
diff --git a/test/test_cuda/test_export.py b/test/test_cuda/test_export.py
index f48902904..c42085317 100644
--- a/test/test_cuda/test_export.py
+++ b/test/test_cuda/test_export.py
@@ -58,7 +58,7 @@ def test_autogptq_format(self, dataloader):
shutil.rmtree("./saved", ignore_errors=True)
@require_optimum
- def test_autogptq_format_fp_layers(self, tiny_opt_model_path, dataloader):
+ def test_autogptq_format_ignore_layers(self, tiny_opt_model_path, dataloader):
layer_config = {}
model = AutoModelForCausalLM.from_pretrained(tiny_opt_model_path)
tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path)
@@ -95,7 +95,7 @@ def test_autogptq_format_fp_layers(self, tiny_opt_model_path, dataloader):
# "there there there there there there")
shutil.rmtree("./saved", ignore_errors=True)
- def test_autogptq_format_qsave_fp_layers(self, dataloader):
+ def test_autogptq_format_qsave_ignore_layers(self, dataloader):
model_path = get_model_path("facebook/opt-125m")
model = AutoModelForCausalLM.from_pretrained(model_path)
diff --git a/test/test_cuda/test_main_func.py b/test/test_cuda/test_main_func.py
index ac8b8b91e..bbd442aca 100644
--- a/test/test_cuda/test_main_func.py
+++ b/test/test_cuda/test_main_func.py
@@ -91,7 +91,7 @@ def test_backend_awq(self):
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@require_gptqmodel
- def test_fp_layers(self):
+ def test_ignore_layers(self):
model_name = get_model_path("facebook/opt-125m")
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
@@ -116,7 +116,7 @@ def test_fp_layers(self):
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@require_awq
@require_package_version_ut("transformers", "<4.57.0")
- def test_fp_layers_awq(self):
+ def test_ignore_layers_awq(self):
model_name = get_model_path("facebook/opt-125m")
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
diff --git a/test/test_cuda/test_moe_model.py b/test/test_cuda/test_moe_model.py
index 8c21abbdf..78c5c07cf 100644
--- a/test/test_cuda/test_moe_model.py
+++ b/test/test_cuda/test_moe_model.py
@@ -62,7 +62,7 @@ def quantize_model(model, tokenizer, output_dir, scheme, iters=0):
nsamples=2,
iters=iters,
seqlen=32,
- fp_layers="self_attn,router,lm_head,mlp.gate",
+ ignore_layers="self_attn,router,lm_head,mlp.gate",
)
quantized_model, save_folder = autoround.quantize_and_save(format="auto_round", output_dir=output_dir)
return quantized_model
@@ -177,7 +177,7 @@ def test_qwen3_vl_moe_mxfp(setup_qwen3_vl_moe):
nsamples=2,
seqlen=32,
iters=1,
- fp_layers="self_attn,lm_head,mlp.gate",
+ ignore_layers="self_attn,lm_head,mlp.gate",
)
quantized_model, _ = autoround.quantize_and_save(format="auto_round", output_dir=output_dir)
assert quantized_model is not None, "Quantized model should not be None."