1 change: 1 addition & 0 deletions auto_round/__main__.py
@@ -138,6 +138,7 @@ def __init__(self, *args, **kwargs):
basic.add_argument("--low_cpu_mem_usage", action="store_true", help="Lower CPU memory mode. Defaults to False.")
basic.add_argument(
"--format",
"--formats",
default="auto_round",
type=str,
help="Output format for the quantized model."
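The hunk above registers "--formats" as a second spelling of the existing "--format" flag. argparse accepts multiple option strings in a single add_argument call and derives the destination from the first long option, so both spellings populate args.format. A minimal standalone sketch of that behavior (toy parser, not the project's CLI):

import argparse

parser = argparse.ArgumentParser()
# dest defaults to "format" (taken from the first long option string),
# so --format and --formats are interchangeable aliases.
parser.add_argument(
    "--format",
    "--formats",
    default="auto_round",
    type=str,
    help="Output format for the quantized model.",
)

args = parser.parse_args(["--formats", "gguf"])
print(args.format)  # -> gguf
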
151 changes: 49 additions & 102 deletions auto_round/compressors/base.py
@@ -110,6 +110,39 @@
)
from auto_round.wrapper import WrapperLinear, WrapperMultiblock, unwrapper_block, unwrapper_layer, wrapper_block

SERIALIZATION_KEYS = [
"bits",
"act_bits",
"data_type",
"act_data_type",
"group_size",
"act_group_size",
"sym",
"act_sym",
"act_dynamic",
"amp",
"batch_size",
"enable_minmax_tuning",
"enable_norm_bias_tuning",
"enable_quanted_input",
"gradient_accumulate_steps",
"iters",
"lr",
"low_gpu_mem_usage",
"minmax_lr",
"nsamples",
"quant_block_list",
"regex_config",
"scale_dtype",
"seqlen",
"supported_types",
"static_attention_dtype",
"static_kv_dtype",
"super_bits",
"super_group_size",
"to_quant_block_names",
]


class BaseCompressor(object):
"""Base compressor for LLM quantization
@@ -1105,35 +1138,17 @@ def _quantize_layer_via_rtn(self, name: str, dtype: torch.dtype = None, to_cpu=T
def _immediate_pack(self, name: str):
if not self.immediate_packing:
return
m = get_module(self.model, name)
if not check_to_quantized(m):
return
from auto_round.export import PACKING_LAYER_WITH_FORMAT

target_backend = self.formats[0].output_format
has_gguf = any(fmt.is_gguf() for fmt in self.formats)

if has_gguf:
from auto_round.export.export_to_gguf.export import pack_gguf_layer

output_dir = self._get_save_folder_name(self.formats[0])
model_type = ModelType.MMPROJ if self.mllm else ModelType.TEXT
pack_gguf_layer(
name,
self.model,
self.formats[0].get_backend_name(),
output_dir,
self.layer_config,
self.tokenizer,
processor=self.processor if hasattr(self, "processor") else None,
image_processor=self.image_processor if hasattr(self, "image_processor") else None,
model_type=model_type,
device=self.device,
)
else:
PACKING_LAYER_WITH_FORMAT[target_backend](
name, self.model, self.formats[0].get_backend_name(), device=self.device
)
self.formats[0].immediate_pack(
name=name,
model=self.model,
device=self.device,
output_dir=self._get_save_folder_name(self.formats[0]),
mllm=self.mllm,
layer_config=self.layer_config,
tokenizer=self.tokenizer,
processor=self.processor if hasattr(self, "processor") else None,
image_processor=self.image_processor if hasattr(self, "image_processor") else None,
)

@torch.inference_mode()
def _quantize_rtn(self) -> tuple[torch.nn.Module, dict[str, Any]]:
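
In the hunk above, the GGUF-versus-registry branch that previously lived in _immediate_pack moves behind the format object, so the compressor only calls self.formats[0].immediate_pack(...). A rough sketch of that delegation pattern (the class and registry below are illustrative stand-ins, not the repository's actual format implementation):

from typing import Callable, Dict

# Stand-in for the per-format packing registry the old code consulted directly.
PACKING_LAYER_WITH_FORMAT: Dict[str, Callable] = {}

def _pack_auto_round(name, model, backend, device=None):
    # Placeholder packer; the real one lives in auto_round.export.export_to_autoround.
    print(f"packing {name} via {backend} on {device}")

PACKING_LAYER_WITH_FORMAT["auto_round"] = _pack_auto_round

class OutputFormatSketch:
    """Illustrative only: owns the decision the compressor used to make inline."""

    def __init__(self, output_format: str, backend: str):
        self.output_format = output_format
        self.backend = backend

    def is_gguf(self) -> bool:
        return self.output_format == "gguf"

    def immediate_pack(self, name, model, device=None, **extras) -> None:
        if self.is_gguf():
            # A GGUF format would route to its dedicated exporter here (omitted).
            raise NotImplementedError("GGUF path omitted in this sketch")
        PACKING_LAYER_WITH_FORMAT[self.output_format](name, model, self.backend, device=device)
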
@@ -2923,98 +2938,30 @@ def save_quantized(
folders = []
for format in formats:
save_folder = self._get_save_folder_name(format)
if format.is_fake(): # TODO fix act quantization later
self.model = self.model.to("cpu")
self.model.save_pretrained(output_dir)
if self.tokenizer is not None and hasattr(self.tokenizer, "save_pretrained"):
self.tokenizer.save_pretrained(output_dir)
processor = kwargs.get("processor", None)
if processor is not None:
processor.save_pretrained(output_dir)
try:
copy_python_files_from_model_cache(self.model, output_dir)
except Exception as e:
logger.warning("Skipping source model Python file copy due to error: %s", e)
compressed_model = self.model
continue
if self.act_bits <= 8 and format.is_fake():
logger.warning(
"Support for exporting activation quantization is limited. "
"Please ensure that your configuration is supported."
)
from auto_round.export import EXPORT_FORMAT

backend = format.get_backend_name()
output_format = format.output_format
if output_format not in EXPORT_FORMAT:
raise ValueError(f"export format only supports {EXPORT_FORMAT.keys()}, but got {output_format}")
save_quantized_as_format = EXPORT_FORMAT.get(output_format)
serialization_keys = [
"bits",
"group_size",
"sym",
"data_type",
"enable_quanted_input",
"enable_minmax_tuning",
"seqlen",
"batch_size",
"scale_dtype",
"lr",
"minmax_lr",
"gradient_accumulate_steps",
"iters",
"amp",
"nsamples",
"low_gpu_mem_usage",
"to_quant_block_names",
"enable_norm_bias_tuning",
"act_bits",
"act_group_size",
"act_sym",
"act_dynamic",
"act_data_type",
"super_bits",
"super_group_size",
"regex_config",
"static_kv_dtype",
"static_attention_dtype",
]

if isinstance(self.dataset, str):
serialization_keys.append("dataset")
SERIALIZATION_KEYS.append("dataset")
serialization_dict = {}
for key in serialization_keys:
for key in SERIALIZATION_KEYS:
serialization_dict[key] = getattr(self, key)
from auto_round.version import __version__

serialization_dict["autoround_version"] = __version__
if "scale_dtype" in serialization_dict.keys():
serialization_dict["scale_dtype"] = str(serialization_dict["scale_dtype"])
compressed_model = save_quantized_as_format( # TODO refine the code
compressed_model = format.save_quantized(
save_folder,
model=self.model,
layer_config=self.layer_config,
inplace=inplace,
bits=self.bits,
act_bits=self.act_bits,
group_size=self.group_size,
sym=self.sym,
iters=self.iters,
lr=self.lr,
minmax_lr=self.minmax_lr,
enable_minmax_tuning=self.enable_minmax_tuning,
enable_quanted_input=self.enable_quanted_input,
scale_dtype=self.scale_dtype,
tokenizer=self.tokenizer,
supported_types=self.supported_types,
data_type=self.data_type,
act_data_type=self.act_data_type,
serialization_dict=serialization_dict,
backend=backend,
to_quant_block_names=self.to_quant_block_names,
quant_block_list=self.quant_block_list,
device=self.device,
static_kv_dtype=self.static_kv_dtype,
static_attention_dtype=self.static_attention_dtype,
serialization_dict=serialization_dict,
**kwargs,
)
folders.append(save_folder)
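The save_quantized hunk above replaces the long inline serialization_keys list and per-argument plumbing with the module-level SERIALIZATION_KEYS constant plus a single format.save_quantized(...) call. A hedged sketch of how the serialization payload can be assembled from that constant (the helper name is hypothetical; copying the list is a deliberate choice in this sketch so that appending "dataset" does not mutate the shared constant between calls):

def build_serialization_dict(compressor, autoround_version: str) -> dict:
    # Copy so the optional "dataset" key does not accumulate on the
    # module-level SERIALIZATION_KEYS list across save_quantized() calls.
    keys = list(SERIALIZATION_KEYS)
    if isinstance(compressor.dataset, str):
        keys.append("dataset")

    payload = {key: getattr(compressor, key) for key in keys}
    payload["autoround_version"] = autoround_version
    if "scale_dtype" in payload:
        payload["scale_dtype"] = str(payload["scale_dtype"])
    return payload
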
79 changes: 0 additions & 79 deletions auto_round/export/__init__.py
@@ -11,82 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from auto_round.export.register import EXPORT_FORMAT, PACKING_LAYER_WITH_FORMAT, register_format, register_layer_packing


@register_format("auto_gptq")
def _save_quantized_as_autogptq(*args, **kwargs):
from auto_round.export.export_to_autogptq.export import save_quantized_as_autogptq

return save_quantized_as_autogptq(*args, **kwargs)


@register_format("itrex")
def _save_quantized_as_itrex(*args, **kwargs):
from auto_round.export.export_to_itrex.export import save_quantized_as_itrex

return save_quantized_as_itrex(*args, **kwargs)


@register_format("itrex_xpu")
def _save_quantized_as_itrex_xpu(*args, **kwargs):
from auto_round.export.export_to_itrex.export import save_quantized_as_itrex_xpu

return save_quantized_as_itrex_xpu(*args, **kwargs)


@register_format("auto_round")
def _save_quantized_as_autoround(*args, **kwargs):
from auto_round.export.export_to_autoround.export import save_quantized_as_autoround

return save_quantized_as_autoround(*args, **kwargs)


@register_format("auto_awq")
def _save_quantized_as_autoawq(*args, **kwargs):
from auto_round.export.export_to_awq.export import save_quantized_as_autoawq

return save_quantized_as_autoawq(*args, **kwargs)


@register_format("gguf")
def _save_quantized_as_gguf(*args, **kwargs):
from auto_round.export.export_to_gguf.export import save_quantized_as_gguf

return save_quantized_as_gguf(*args, **kwargs)


@register_layer_packing("auto_round")
def _packing_layer_with_autoround(*args, **kwargs):
from auto_round.export.export_to_autoround.export import pack_layer

return pack_layer(*args, **kwargs)


@register_layer_packing("auto_gptq")
def _packing_layer_with_autogptq(*args, **kwargs):
from auto_round.export.export_to_autogptq.export import pack_layer

return pack_layer(*args, **kwargs)


@register_layer_packing("auto_awq")
def _packing_layer_with_autoawq(*args, **kwargs):
from auto_round.export.export_to_awq.export import pack_layer

return pack_layer(*args, **kwargs)


@register_format("llm_compressor")
def _save_quantized_as_llmcompressor(*args, **kwargs):
from auto_round.export.export_to_llmcompressor.export import save_quantized_as_llmcompressor

return save_quantized_as_llmcompressor(*args, **kwargs)


@register_layer_packing("llm_compressor")
def _packing_layer_with_llmcompressor(*args, **kwargs):
from auto_round.export.export_to_llmcompressor.export import pack_layer

return pack_layer(*args, **kwargs)
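
Every registration deleted above follows the same shape: a decorator stores a thin wrapper in the EXPORT_FORMAT or PACKING_LAYER_WITH_FORMAT registry and defers the heavy exporter import until the wrapper is actually called. A minimal self-contained sketch of that decorator-registry pattern (simplified; the real auto_round.export.register module may differ in detail):

from typing import Callable, Dict

EXPORT_FORMAT: Dict[str, Callable] = {}

def register_format(name: str) -> Callable:
    # Decorator factory: map a format name to its exporter wrapper so callers
    # can dispatch by string, e.g. EXPORT_FORMAT["auto_round"](...).
    def decorator(func: Callable) -> Callable:
        EXPORT_FORMAT[name] = func
        return func
    return decorator

@register_format("auto_round")
def _save_quantized_as_autoround(*args, **kwargs):
    # The real wrapper imports auto_round.export.export_to_autoround lazily
    # and forwards *args/**kwargs; this stub only marks the dispatch point.
    return ("auto_round exporter called", args, kwargs)

exporter = EXPORT_FORMAT["auto_round"]  # lookup by output format name
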
22 changes: 14 additions & 8 deletions auto_round/export/export_to_autogptq/export.py
@@ -18,7 +18,7 @@
import os
from concurrent.futures import ThreadPoolExecutor
from dataclasses import fields
from typing import Any, Dict
from typing import Any, Callable, Dict, Union

import threadpoolctl as tctl

@@ -190,18 +190,24 @@ def pack_layer(name, model, backend, device=None):
release_layer_safely(layer)


def save_quantized_as_autogptq(output_dir, inplace=True, backend="auto_gptq:exllamav2", **kwargs):
def save_quantized_as_autogptq(
output_dir: str,
model: torch.nn.Module = None,
tokenizer: Callable = None,
layer_config: dict = None,
inplace: bool = True,
device: Union[str, torch.device] = "cpu",
backend: str = "auto_gptq:exllamav2",
serialization_dict: dict = None,
**kwargs,
) -> torch.nn.Module:
"""Export the model to autogptq format to easily leverage cuda kernel."""

# --- 1️⃣ Extract inputs & configs ---
model = kwargs["model"]
quantization_config = kwargs["serialization_dict"]
layer_config = kwargs["layer_config"]
quant_block_list = kwargs.get("quant_block_list", get_block_names(model))
tokenizer = kwargs.get("tokenizer")
quantization_config = serialization_dict
quant_block_list = serialization_dict.get("quant_block_list", get_block_names(model))
processor = kwargs.get("processor")
image_processor = kwargs.get("image_processor")
device = kwargs.get("device")
safe_serialization = kwargs.get("safe_serialization", True)

# --- Save metadata (tokenizer, processor, etc.) ---
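The new save_quantized_as_autogptq signature above makes the required collaborators (model, tokenizer, layer_config, device, serialization_dict) explicit keyword parameters instead of kwargs["..."] lookups, while optional extras such as processor, image_processor, and safe_serialization continue to arrive through **kwargs. A hypothetical call site under that signature (all argument values are placeholders):

compressed = save_quantized_as_autogptq(
    "./quantized-model",                    # output_dir
    model=model,                            # torch.nn.Module produced by the compressor
    tokenizer=tokenizer,
    layer_config=layer_config,
    inplace=True,
    device="cpu",
    backend="auto_gptq:exllamav2",
    serialization_dict=serialization_dict,  # carries bits, quant_block_list, scale_dtype, ...
    safe_serialization=True,                # still read from **kwargs
)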