diff --git a/README.md b/README.md index 1e5b5c30e..f24f28ddf 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ AutoRound is an advanced quantization toolkit designed for Large Language Models (LLMs) and Vision-Language Models (VLMs). It achieves high accuracy at ultra-low bit widths (2–4 bits) with minimal tuning by leveraging **sign-gradient descent** and providing broad hardware compatibility. -See our [paper](https://arxiv.org/pdf/2309.05516) for more details. For usage instructions, please refer to the [User Guide](./docs/step_by_step.md). +See our papers [SignRoundV1](https://arxiv.org/pdf/2309.05516) and [SignRoundV2](http://arxiv.org/abs/2512.04746) for more details. For usage instructions, please refer to the [User Guide](./docs/step_by_step.md).

AutoRound Overview @@ -39,7 +39,7 @@ See our [paper](https://arxiv.org/pdf/2309.05516) for more details. For usage in * [2025/10] AutoRound has been integrated into **SGLang**: [*Usage*](https://docs.sglang.io/advanced_features/quantization.html#using-auto-round), [*LMSYS Blog*](https://lmsys.org/blog/2025-11-13-AutoRound/), [*X post*](https://x.com/lmsysorg/status/1991977019220148650?s=20), [*Intel blog*](https://community.intel.com/t5/Blogs/Tech-Innovation/Artificial-Intelligence-AI/AutoRound-Meets-SGLang-Enabling-Quantized-Model-Inference-with/post/1727196), [*Linkedin*](https://www.linkedin.com/feed/update/urn:li:activity:7397742859354857472). -* [2025/10] A **mix precision** algorithm is available to generate schemes in minutes: [*Usage*](https://github.com/intel/auto-round/blob/main/docs/step_by_step.md#autoscheme), [*Accuracy*](./docs/auto_scheme_acc.md). +* [2025/10] A **mixed precision** algorithm is available to generate schemes in minutes: [*Usage*](https://github.com/intel/auto-round/blob/main/docs/step_by_step.md#autoscheme), [*Accuracy*](./docs/auto_scheme_acc.md). * [2025/09] **MXFP4** and **NVFP4** dtypes is available: [*Accuracy*](./docs/mxnv_acc.md). @@ -339,8 +339,15 @@ print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0])) ## Publications & Events +[SignRoundV2: Closing the Performance Gap in Extremely Low-Bit Post-Training Quantization for LLMs](https://arxiv.org/abs/2512.04746) (202512 paper) -[Publication List](./docs/publication_list.md). +[Optimize Weight Rounding via Signed Gradient Descent for the Quantization of LLM](https://aclanthology.org/2024.findings-emnlp.662/) (202309 paper) + +[TEQ: Trainable Equivalent Transformation for Quantization of LLMs](https://arxiv.org/abs/2310.10944) (202310 paper) + +[Effective Post-Training Quantization for Large Language Models](https://medium.com/intel-analytics-software/effective-post-training-quantization-for-large-language-models-with-enhanced-smoothquant-approach-93e9d104fb98) (202304 blog) + +Check out [Full Publication List](./docs/publication_list.md). ## Acknowledgement Special thanks to open-source low precision libraries such as AutoGPTQ, AutoAWQ, GPTQModel, Triton, Marlin, and ExLLaMAV2 for providing low-precision CUDA kernels, which are leveraged in AutoRound. diff --git a/auto_round/__main__.py b/auto_round/__main__.py index d127cf571..adbd8fe02 100644 --- a/auto_round/__main__.py +++ b/auto_round/__main__.py @@ -303,6 +303,7 @@ def __init__(self, *args, **kwargs): help="Quantize the lm_head. " "Usually kept in higher precision for better output quality.", ) scheme.add_argument( + "--ignore_layers", "--fp_layers", default="", type=str, @@ -599,7 +600,7 @@ def tune(args): super_bits=args.super_bits, super_group_size=args.super_group_size, quant_lm_head=args.quant_lm_head, - fp_layers=args.fp_layers, + ignore_layers=args.ignore_layers, static_kv_dtype=args.static_kv_dtype, static_attention_dtype=args.static_attention_dtype, ) diff --git a/auto_round/autoround.py b/auto_round/autoround.py index a15919cfb..050281983 100644 --- a/auto_round/autoround.py +++ b/auto_round/autoround.py @@ -82,9 +82,7 @@ def __new__( device_map: Union[str, torch.device, int, dict] = 0, enable_torch_compile: bool = False, seed: int = 42, - # for adam enable_adam: bool = False, - # for MLLM and Diffusion extra_config: ExtraConfig = None, enable_alg_ext: bool = None, disable_opt_rtn: bool = None, @@ -96,36 +94,42 @@ def __new__( Args: model (torch.nn.Module | str): Model object or model name to load. 
             tokenizer: Tokenizer for text processing. Required if `model` is not a string and `iters > 0`.
+            platform: The platform from which to download the pretrained model; options: ["hf", "model_scope"].
             scheme (str| dict | QuantizationScheme ): A preset scheme that defines the quantization configurations
-            bits (int, optional): Weight quantization bits. Defaults to 4.
-            group_size (int, optional): Weight quantization group size. Defaults to 128.
-            sym (bool, optional): Symmetric weight quantization. Defaults to True.
             layer_config (dict, optional): Layer-wise quantization config. Defaults to None.
-            batch_size (int, optional): Calibration batch size. Defaults to 8.
-            amp (bool, optional): Use AMP for tuning. Defaults to True.
-            device (str | torch.device | int, optional): Compute device. Defaults to 0.
             dataset (str | list | tuple | DataLoader, optional): Calibration data. Defaults to "NeelNanda/pile-10k".
-            enable_minmax_tuning (bool, optional): Enable weight min-max tuning. Defaults to True.
-            lr (float, optional): Learning rate; if None, set to 1.0 / iters except when iters==0.
-            minmax_lr (float, optional): Learning rate for min-max tuning; defaults to `lr`.
-            low_gpu_mem_usage (bool, optional): Lower GPU memory mode. Defaults to False.
-            low_cpu_mem_usage (bool, optional): Lower CPU memory mode. Defaults to False.
             iters (int, optional): Optimization iterations. Defaults to 200.
             seqlen (int, optional): Calibration sequence length. Defaults to 2048.
             nsamples (int, optional): Number of calibration samples. Defaults to 128.
-            seed (int, optional): Random seed. Defaults to 42.
+            batch_size (int, optional): Calibration batch size. Defaults to 8.
             gradient_accumulate_steps (int, optional): Gradient accumulation steps. Defaults to 1.
+            low_gpu_mem_usage (bool, optional): Lower GPU memory mode. Defaults to False.
+            device_map (str | dict, optional): Device map for each module. Defaults to 0.
+            enable_torch_compile (bool, optional): Enable torch.compile to reduce quantization cost. Defaults to False.
+            seed (int, optional): Random seed. Defaults to 42.
+            enable_adam (bool, optional): Use an Adam-based optimizer instead of the default sign-gradient descent. Defaults to False.
+            extra_config (ExtraConfig, optional): Bundle of additional configurations (e.g., scheme and MLLM settings). Defaults to None.
+            enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2)
+                for better accuracy. Defaults to False.
+            disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0) for faster quantization
+                with lower accuracy. Defaults to False.
+            low_cpu_mem_usage (bool, optional): Lower CPU memory mode. Defaults to False.
+
+            bits (int, optional): Weight quantization bits. Defaults to 4.
+            group_size (int, optional): Weight quantization group size. Defaults to 128.
+            sym (bool, optional): Symmetric weight quantization. Defaults to True.
             data_type (str, optional): Weight data type string, e.g., "int". Defaults to "int".
             act_bits (int, optional): Activation quantization bits. Defaults to 16.
             act_group_size (int, optional): Activation group size. Defaults to None.
             act_sym (bool, optional): Symmetric activation quantization. Defaults to None.
             act_data_type (str, optional): Activation data type; inherits weight dtype if None and act_bits < 16.
             act_dynamic (bool, optional): Dynamic activation quantization. Defaults to True.
-            enable_torch_compile (bool, optional): Enable torch.compile for quant blocks/layers. Defaults to False.
-            device_map (str | dict, optional): Device placement map. Defaults to None.
-            disable_opt_rtn (bool, optional): Disable RTN-mode optimization (iters=0). Defaults to False.
-            enable_alg_ext (bool, optional): Enable algorithm extension (primarily for INT2). Defaults to False.
             model_dtype (str): model dtype used to load pre-trained model.
+            amp (bool, optional): Use AMP for tuning. Defaults to True.
+            enable_minmax_tuning (bool, optional): Enable weight min-max tuning. Defaults to True.
+            lr (float, optional): Learning rate; if None, set to 1.0 / iters except when iters==0.
+            minmax_lr (float, optional): Learning rate for min-max tuning; defaults to `lr`.
+
+            **kwargs: Backward compatible options:
                 - enable_alg_ext, quant_lm_head, lr, lr_scheduler, sampler, not_use_best_mse, dynamic_max_gap,
                   super_group_size, super_bits, scale_dtype ("fp16" etc.),
@@ -140,18 +144,15 @@
             >>> layer_config = {
             ...     "layer1": {
-            ...         "data_type": "int",
-            ...         "bits": 4,
+            ...         "bits": 3,
             ...         "group_size": 128,
             ...         "sym": True,
-            ...         "act_data_type": None,
-            ...         "act_bits": 16,
-            ...         "act_group_size": None,
-            ...         "act_sym": None,
             ...     },
+            ...     "layer2": "W8A16",
             ...     # ...
             ... }
             """
+        model_cls = []
         if (extra_config and not extra_config.mllm_config.is_default()) or is_mllm_model(model, platform=platform):
@@ -179,6 +182,11 @@
             kwargs["enable_alg_ext"] = enable_alg_ext
         if disable_opt_rtn is not None:
             kwargs["disable_opt_rtn"] = disable_opt_rtn
+        if "fp_layers" in kwargs:
+            logger.warning_once(
+                "'fp_layers' is deprecated; please use 'ignore_layers' to set the layers that should not be quantized."
+            )
+            kwargs["ignore_layers"] = kwargs.pop("fp_layers")
         ar = dynamic_compressor(
             model=model,
             tokenizer=tokenizer,
diff --git a/auto_round/compressors/base.py b/auto_round/compressors/base.py
index 8464aa4fb..2665d74ef 100644
--- a/auto_round/compressors/base.py
+++ b/auto_round/compressors/base.py
@@ -312,7 +312,7 @@ def __init__(
         self.platform = platform
         self.quant_lm_head = kwargs.pop("quant_lm_head", False)
-        self.fp_layers = kwargs.pop("fp_layers", "")
+        self.ignore_layers = kwargs.pop("ignore_layers", "")
         self.supported_types = SUPPORTED_LAYER_TYPES
         self.inner_supported_types = INNER_SUPPORTED_LAYER_TYPES
         self.scale_dtype = convert_dtype_str2torch(scale_dtype)
@@ -504,7 +504,7 @@ def _gen_auto_scheme(
             self.supported_types,
             self.inner_supported_types,
             self.quant_block_list,
-            self.fp_layers,
+            self.ignore_layers,
             self.quant_lm_head,
             enable_gguf_official_mixed=False,
             is_mllm=self.mllm,
@@ -1398,7 +1398,7 @@ def configure_layer_config(self, enable_gguf_official_mixed: None | bool = True)
             self.supported_types,
             self.inner_supported_types,
             self.quant_block_list,
-            self.fp_layers,
+            self.ignore_layers,
             self.quant_lm_head,
             enable_gguf_official_mixed=enable_gguf_official_mixed,
             is_mllm=self.mllm,
diff --git a/auto_round/compressors/config.py b/auto_round/compressors/config.py
index 733ed6282..7c9398a91 100644
--- a/auto_round/compressors/config.py
+++ b/auto_round/compressors/config.py
@@ -58,7 +58,7 @@ def __init__(
         super_group_size: int = None,
         static_kv_dtype: Union[str, torch.dtype] = None,
         quant_lm_head: bool = False,
-        fp_layers: str = None,
+        ignore_layers: str = None,
         # mllm
         processor: Callable = None,
         image_processor: Callable = None,
@@ -139,7 +139,7 @@ def __init__(
             super_group_size=super_group_size,
             static_kv_dtype=static_kv_dtype,
             quant_lm_head=quant_lm_head,
-            fp_layers=fp_layers,
+            ignore_layers=ignore_layers,
         )
         self.mllm_config = MLLMExtraConfig(
             processor=processor,
@@ -277,7 +277,7 @@ class SchemeExtraConfig(BaseExtraConfig):
     static_kv_dtype: Union[str, torch.dtype] =
None static_attention_dtype: Union[str, torch.dtype] = None quant_lm_head: bool = False - fp_layers: str = None + ignore_layers: str = None @dataclass diff --git a/auto_round/compressors/utils.py b/auto_round/compressors/utils.py index 266a90198..b5ea27dea 100644 --- a/auto_round/compressors/utils.py +++ b/auto_round/compressors/utils.py @@ -211,7 +211,7 @@ def set_layer_config( supported_types: tuple, inner_supported_types: tuple, quant_block_list=None, - fp_layers: str = "", + ignore_layers: str = "", quant_lm_head: bool = False, enable_gguf_official_mixed: bool = True, is_mllm: bool = False, @@ -261,8 +261,8 @@ def normalize_item(item: Union[str, dict, "QuantizationScheme"], layer_name: str scheme_keys = tuple(f.name for f in fields(QuantizationScheme)) + ("scale_dtype",) layer_config = copy.deepcopy(layer_config) or {} - # 1. fp_layers -> force 16 - for name in get_fp_layer_names(model, fp_layers): + # 1. ignore_layers -> force 16 + for name in get_fp_layer_names(model, ignore_layers): layer_config[name] = { "bits": 16, "act_bits": 16, @@ -852,7 +852,7 @@ def _set_config(config, target_config): return layer_config, gguf_format_config -def get_fp_layer_names(model: torch.nn.Module, fp_layers: str): +def get_fp_layer_names(model: torch.nn.Module, ignore_layers: str): """Identifies and returns layers in the model to exclude from quantization. This function processes a comma-separated list of fully precision (FP) layers, @@ -861,7 +861,7 @@ def get_fp_layer_names(model: torch.nn.Module, fp_layers: str): Args: model (torch.nn.Module): The model whose layers will be inspected. - fp_layers (str): A comma-separated string of layer names to be excluded + ignore_layers (str): A comma-separated string of layer names to be excluded from quantization. Whitespace is ignored in this string. Returns: @@ -870,16 +870,16 @@ def get_fp_layer_names(model: torch.nn.Module, fp_layers: str): """ from auto_round.utils import SUPPORTED_LAYER_TYPES - if not fp_layers: + if not ignore_layers: return [] - fp_layers = fp_layers.replace(" ", "").split(",") + ignore_layers = ignore_layers.replace(" ", "").split(",") all_layer_names = [] for n, m in model.named_modules(): if type(m) in SUPPORTED_LAYER_TYPES: all_layer_names.append(n) not_to_quantized_layers = [] - for fp_layer in fp_layers: + for fp_layer in ignore_layers: if fp_layer == "": continue if fp_layer in all_layer_names: diff --git a/auto_round/export/export_to_autoround/export.py b/auto_round/export/export_to_autoround/export.py index a5176c1db..52b35f9a4 100644 --- a/auto_round/export/export_to_autoround/export.py +++ b/auto_round/export/export_to_autoround/export.py @@ -288,7 +288,7 @@ def save_quantized_as_autoround( neq_keys = check_neq_config(cfg, **{k: quantization_config[k] for k in scheme_keys}) if len(neq_keys) > 0: extra_config[layer_name] = {} - for key in scheme_keys: + for key in neq_keys: if cfg.get(key) is not None: extra_config[layer_name][key] = cfg[key] diff --git a/docs/tips_and_tricks.md b/docs/tips_and_tricks.md index 8d490b773..3f9177fc2 100644 --- a/docs/tips_and_tricks.md +++ b/docs/tips_and_tricks.md @@ -135,7 +135,7 @@ cause the outputs to become excessively large. Even when a fake QDQ model is fin with the INT4 kernel. **Suggestion: It is recommended not to quantize layers with large output values**. In AutoRound (>=0.4), you can -use `--fp_layers "xxx,xxx"` to exclude these layers. +use `--ignore_layers "xxx,xxx"` to exclude these layers. 
**Reasoning**: While adjusting the quantization configuration (symmetric/asymmetric) or using `clamp_to_range` may provide some benefit, configuration tuning can be tedious, and `clamp_to_range` is not always effective. Therefore, it diff --git a/test/test_cpu/test_act_quantization.py b/test/test_cpu/test_act_quantization.py index 6e9152ab2..47cda3599 100644 --- a/test/test_cpu/test_act_quantization.py +++ b/test/test_cpu/test_act_quantization.py @@ -173,15 +173,7 @@ def test_WOQ_config_INT_saving(self, tiny_opt_model_path, dataloader): # check inblock layer config values kproj_config = extra_config["model.decoder.layers.1.self_attn.k_proj"] - assert "act_data_type" in kproj_config.keys() and kproj_config["act_data_type"] == "float" - assert "act_bits" in kproj_config.keys() and kproj_config["act_bits"] == 16 - assert "act_group_size" in kproj_config.keys() and kproj_config["act_group_size"] == 128 - assert "act_sym" in kproj_config.keys() and not kproj_config["act_sym"] - assert "data_type" in kproj_config.keys() and kproj_config["data_type"] == "int" assert "bits" in kproj_config.keys() and kproj_config["bits"] == 8 - assert "group_size" in kproj_config.keys() and kproj_config["group_size"] == 128 - assert "sym" in kproj_config.keys() and not kproj_config["sym"] - assert "act_dynamic" in kproj_config.keys() and kproj_config["act_dynamic"] shutil.rmtree(quantized_model_path, ignore_errors=True) def test_act_config_FP8_saving(self, tiny_opt_model_path, dataloader): @@ -214,12 +206,6 @@ def test_act_config_FP8_saving(self, tiny_opt_model_path, dataloader): # check inblock layer config values kproj_config = extra_config["model.decoder.layers.0.self_attn.k_proj"] - assert "act_data_type" in kproj_config.keys() and kproj_config["act_data_type"] == "fp" assert "act_bits" in kproj_config.keys() and kproj_config["act_bits"] == 16 - assert "act_group_size" in kproj_config.keys() and kproj_config["act_group_size"] == 0 - assert "act_sym" in kproj_config.keys() and kproj_config["act_sym"] - assert "data_type" in kproj_config.keys() and kproj_config["data_type"] == "fp" - assert "bits" in kproj_config.keys() and kproj_config["bits"] == 8 assert "group_size" in kproj_config.keys() and kproj_config["group_size"] == 0 - assert "sym" in kproj_config.keys() and kproj_config["sym"] shutil.rmtree(quantized_model_path, ignore_errors=True) diff --git a/test/test_cpu/test_gguf_format.py b/test/test_cpu/test_gguf_format.py index 92e9d620e..4b89a3a66 100644 --- a/test/test_cpu/test_gguf_format.py +++ b/test/test_cpu/test_gguf_format.py @@ -230,7 +230,7 @@ def test_qtype_setting(self): ar.supported_types, ar.inner_supported_types, ar.quant_block_list, - ar.fp_layers, + ar.ignore_layers, ar.quant_lm_head, enable_gguf_official_mixed=True, is_mllm=ar.mllm, @@ -249,7 +249,7 @@ def test_qtype_setting(self): ar.supported_types, ar.inner_supported_types, ar.quant_block_list, - ar.fp_layers, + ar.ignore_layers, ar.quant_lm_head, enable_gguf_official_mixed=True, is_mllm=ar.mllm, @@ -271,7 +271,7 @@ def test_qtype_setting(self): ar.supported_types, ar.inner_supported_types, ar.quant_block_list, - ar.fp_layers, + ar.ignore_layers, ar.quant_lm_head, enable_gguf_official_mixed=True, is_mllm=ar.mllm, diff --git a/test/test_cpu/test_moe_alignment.py b/test/test_cpu/test_moe_alignment.py index b3dfe5a61..3e689a7e0 100644 --- a/test/test_cpu/test_moe_alignment.py +++ b/test/test_cpu/test_moe_alignment.py @@ -41,7 +41,7 @@ def test_moe_scale_alignment_fp8_static(setup_deepseek_v2_lite): nsamples=4, iters=0, # RTN for faster 
testing seqlen=32, - fp_layers="self_attn,lm_head", + ignore_layers="self_attn,lm_head", ) quantized_model, save_folder = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) diff --git a/test/test_cpu/test_moe_model.py b/test/test_cpu/test_moe_model.py index d60d975fb..397e74820 100644 --- a/test/test_cpu/test_moe_model.py +++ b/test/test_cpu/test_moe_model.py @@ -65,7 +65,7 @@ def quantize_model(model, tokenizer, output_dir, scheme, iters=0): nsamples=2, iters=iters, seqlen=32, - fp_layers="self_attn,router,lm_head,mlp.gate", + ignore_layers="self_attn,router,lm_head,mlp.gate", ) quantized_model, save_folder = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) return quantized_model @@ -154,7 +154,7 @@ def test_qwen3_vl_moe_mxfp(setup_qwen3_vl_moe): nsamples=2, seqlen=32, iters=1, - fp_layers="self_attn,lm_head,mlp.gate", + ignore_layers="self_attn,lm_head,mlp.gate", ) quantized_model, _ = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) assert quantized_model is not None, "Quantized model should not be None." diff --git a/test/test_cpu/test_mxfp_nvfp.py b/test/test_cpu/test_mxfp_nvfp.py index 7e0600f05..8baaf110e 100644 --- a/test/test_cpu/test_mxfp_nvfp.py +++ b/test/test_cpu/test_mxfp_nvfp.py @@ -147,7 +147,7 @@ def test_mxfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): ), "Illegal MXFP4 packing name or data_type or shape" assert not hasattr(skip_layer, "weight_scale") and not hasattr( ## check skipped layers skip_layer, "weight_packed" - ), "Illegal MXFP4 quantization for fp_layers" + ), "Illegal MXFP4 quantization for ignore_layers" quantization_config = AutoConfig.from_pretrained( quantized_model_path, trust_remote_code=True ).quantization_config @@ -187,7 +187,7 @@ def test_rtn_mxfp4_llmcompressor_format(self, tiny_opt_model_path, dataloader): ), "Illegal MXFP4 packing name or data_type or shape" assert not hasattr(skip_layer, "weight_scale") and not hasattr( ## check skipped layers skip_layer, "weight_packed" - ), "Illegal MXFP4 quantization for fp_layers" + ), "Illegal MXFP4 quantization for ignore_layers" quantization_config = AutoConfig.from_pretrained( quantized_model_path, trust_remote_code=True ).quantization_config diff --git a/test/test_cuda/test_export.py b/test/test_cuda/test_export.py index f48902904..c42085317 100644 --- a/test/test_cuda/test_export.py +++ b/test/test_cuda/test_export.py @@ -58,7 +58,7 @@ def test_autogptq_format(self, dataloader): shutil.rmtree("./saved", ignore_errors=True) @require_optimum - def test_autogptq_format_fp_layers(self, tiny_opt_model_path, dataloader): + def test_autogptq_format_ignore_layers(self, tiny_opt_model_path, dataloader): layer_config = {} model = AutoModelForCausalLM.from_pretrained(tiny_opt_model_path) tokenizer = AutoTokenizer.from_pretrained(tiny_opt_model_path) @@ -95,7 +95,7 @@ def test_autogptq_format_fp_layers(self, tiny_opt_model_path, dataloader): # "there there there there there there") shutil.rmtree("./saved", ignore_errors=True) - def test_autogptq_format_qsave_fp_layers(self, dataloader): + def test_autogptq_format_qsave_ignore_layers(self, dataloader): model_path = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_path) diff --git a/test/test_cuda/test_main_func.py b/test/test_cuda/test_main_func.py index ac8b8b91e..bbd442aca 100644 --- a/test/test_cuda/test_main_func.py +++ b/test/test_cuda/test_main_func.py @@ -91,7 +91,7 @@ def test_backend_awq(self): @pytest.mark.skipif(not 
torch.cuda.is_available(), reason="CUDA not available") @require_gptqmodel - def test_fp_layers(self): + def test_ignore_layers(self): model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) @@ -116,7 +116,7 @@ def test_fp_layers(self): @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") @require_awq @require_package_version_ut("transformers", "<4.57.0") - def test_fp_layers_awq(self): + def test_ignore_layers_awq(self): model_name = get_model_path("facebook/opt-125m") model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto") tokenizer = AutoTokenizer.from_pretrained(model_name) diff --git a/test/test_cuda/test_moe_model.py b/test/test_cuda/test_moe_model.py index 8c21abbdf..78c5c07cf 100644 --- a/test/test_cuda/test_moe_model.py +++ b/test/test_cuda/test_moe_model.py @@ -62,7 +62,7 @@ def quantize_model(model, tokenizer, output_dir, scheme, iters=0): nsamples=2, iters=iters, seqlen=32, - fp_layers="self_attn,router,lm_head,mlp.gate", + ignore_layers="self_attn,router,lm_head,mlp.gate", ) quantized_model, save_folder = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) return quantized_model @@ -177,7 +177,7 @@ def test_qwen3_vl_moe_mxfp(setup_qwen3_vl_moe): nsamples=2, seqlen=32, iters=1, - fp_layers="self_attn,lm_head,mlp.gate", + ignore_layers="self_attn,lm_head,mlp.gate", ) quantized_model, _ = autoround.quantize_and_save(format="auto_round", output_dir=output_dir) assert quantized_model is not None, "Quantized model should not be None."
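
For reviewers who want to try the rename locally, a minimal sketch is below. The model id, preset scheme name, and layer substrings are illustrative placeholders rather than part of this patch; the call pattern mirrors the updated tests (`AutoRound(..., ignore_layers=...)` followed by `quantize_and_save`).

```python
# Minimal sketch of the `fp_layers` -> `ignore_layers` rename; placeholders only.
from auto_round import AutoRound

ar = AutoRound(
    "facebook/opt-125m",          # model name or an already-loaded torch.nn.Module
    scheme="W4A16",               # preset scheme string (assumed to be available)
    iters=0,                      # RTN mode for a quick smoke test
    nsamples=2,
    seqlen=32,
    # Comma-separated layer names (or name fragments) to keep in original precision;
    # this keyword replaces the deprecated `fp_layers`.
    ignore_layers="lm_head,self_attn",
)
ar.quantize_and_save(format="auto_round", output_dir="./saved")

# The old spelling is still accepted but now triggers a deprecation warning in
# AutoRound.__new__ and is remapped to `ignore_layers`:
#   AutoRound("facebook/opt-125m", scheme="W4A16", iters=0, fp_layers="lm_head")
```

On the command line the same option is exposed as `--ignore_layers`, with `--fp_layers` kept as an alias per the `auto_round/__main__.py` hunk above.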