97 changes: 53 additions & 44 deletions auto_round/export/export_to_gguf/convert.py
@@ -229,51 +229,45 @@ def _quant_data(cls, data_torch, data_qtype, name, modify_name, new_name, bid, d
"""
suffix = ".weight"
device = data_torch.device if device is None else device
if suffix in name:

if name.endswith(suffix):
layer_name = name[: -len(suffix)]
module = get_module(cls.model, layer_name)
kwargs = {
"scale": None,
"zp": None,
"d_scale": None,
"d_wmin": None,
"wmin": None,
"imatrix": None,
}
if hasattr(module, "scale"):
else:
layer_name = name
module = get_module(cls.model, layer_name)
kwargs = {
"scale": None,
"zp": None,
"d_scale": None,
"d_wmin": None,
"wmin": None,
"imatrix": None,
}
# support for MoE models whose experts module is not a Linear layer
# if hasattr(module, "scale") or ("exps" in new_name and len(data_torch.shape) == 3):
for attr in ["scale", "zp", "w_d_scale", "w_d_wmin", "w_wmin"]:
if hasattr(module, attr) and getattr(module, attr) is not None:
attr_tensor = getattr(module, attr)
if not isinstance(attr_tensor, torch.Tensor):
continue
if hasattr(cls, "permute"):
bs = module.weight.shape[0]
for attr in ["scale", "zp", "w_d_scale", "w_d_wmin", "w_wmin"]:
if hasattr(module, attr) and getattr(module, attr) is not None:
attr_tensor = getattr(module, attr)
if not isinstance(attr_tensor, torch.Tensor):
continue
attr_tensors_dict = dict(cls.modify_tensors(attr_tensor.reshape(bs, -1), modify_name, bid))
attr_tensor = attr_tensors_dict[new_name]
if attr in kwargs:
kwargs[attr] = attr_tensor.to(torch.float32)
else:
kwargs[attr.replace("w_", "")] = attr_tensor.to(torch.float32)
data_torch = data_torch.to(torch.float32)
attr_tensors_dict = dict(cls.modify_tensors(attr_tensor.reshape(bs, -1), modify_name, bid))
attr_tensor = attr_tensors_dict[new_name]
if attr in kwargs:
kwargs[attr] = attr_tensor.to(torch.float32)
else:
kwargs[attr.replace("w_", "")] = attr_tensor.to(torch.float32)
data_torch = data_torch.to(torch.float32)

data = ggml_quant(data_torch, data_qtype.name.lower(), device=device, **kwargs)
else:
# if data_torch.dtype ==torch.float32:
# data_qtype = gguf.GGMLQuantizationType.F32
# else:
# data_qtype = gguf.GGMLQuantizationType.F16
data_qtype = gguf.GGMLQuantizationType.F32 ##FP16 has issues at inference
data = data_torch.to(torch.float32).squeeze().cpu().numpy()
else:
# for Llama-4
# if data_torch.dtype == torch.float32:
# data_qtype = gguf.GGMLQuantizationType.F32
# else:
# data_qtype = gguf.GGMLQuantizationType.F16
# data = data_torch.squeeze().cpu().numpy()
# data_qtype = gguf.GGMLQuantizationType.F32
# data = data_torch.to(torch.float32).squeeze().cpu().numpy()
data = ggml_quant(data_torch, data_qtype.name.lower(), device=device)
data = ggml_quant(data_torch, data_qtype.name.lower(), device=device, **kwargs)
# else:
# # if data_torch.dtype ==torch.float32:
# # data_qtype = gguf.GGMLQuantizationType.F32
# # else:
# # data_qtype = gguf.GGMLQuantizationType.F16
# data_qtype = gguf.GGMLQuantizationType.F32 ##FP16 has issues at inference
# data = data_torch.to(torch.float32).squeeze().cpu().numpy()
return data, data_qtype
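Reviewer note: the parameter-gathering loop above is easier to follow in isolation. Below is a minimal sketch, assuming a module that may carry scale / zp / w_d_scale / w_d_wmin / w_wmin tensors attached by auto-round; collect_quant_kwargs and remap_fn are hypothetical names used only for illustration, not part of the patch.

import torch


def collect_quant_kwargs(module, weight_rows, remap_fn=None):
    # Optional per-layer quantization tensors attached by auto-round;
    # anything the module does not carry stays None.
    kwargs = {"scale": None, "zp": None, "d_scale": None, "d_wmin": None, "wmin": None, "imatrix": None}
    for attr in ["scale", "zp", "w_d_scale", "w_d_wmin", "w_wmin"]:
        tensor = getattr(module, attr, None)
        if not isinstance(tensor, torch.Tensor):
            continue
        if remap_fn is not None:
            # mirrors cls.modify_tensors(): reshape to (rows, -1) before remapping names
            tensor = remap_fn(tensor.reshape(weight_rows, -1))
        # "w_d_scale" / "w_d_wmin" / "w_wmin" map onto the "d_scale" / "d_wmin" / "wmin" kwargs
        key = attr if attr in kwargs else attr.replace("w_", "")
        kwargs[key] = tensor.to(torch.float32)
    return kwargs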


@@ -419,7 +413,9 @@ def prepare_tensors(cls):
break
if skip:
continue
data = data_torch.squeeze()
# sync with the new version of gguf: keep the original shape (no squeeze)
# data = data_torch.squeeze()
data = data_torch
n_dims = len(data.shape)
data_qtype: gguf.GGMLQuantizationType | bool = cls.tensor_force_quant(name, new_name, bid, n_dims)

Expand Down Expand Up @@ -529,17 +525,30 @@ def prepare_tensors(cls):
elif data_qtype == gguf.GGMLQuantizationType.Q6_K:
data_qtype = gguf.GGMLQuantizationType.Q8_0

from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES

if data_qtype.name.lower() in GGML_QUANT_SIZES:
block_size, type_size = GGML_QUANT_SIZES[data_qtype.name.lower()]
if data_torch.shape[-1] % block_size != 0:
logger.warning(
f"{new_name}: Can't quantize tensor with shape {data_torch.shape} to {data_qtype.name},"
" fallback to F16"
)
data_qtype = gguf.GGMLQuantizationType.F16

if isinstance(data_qtype, bool) or data_qtype in [
gguf.GGMLQuantizationType.F16,
gguf.GGMLQuantizationType.BF16,
gguf.GGMLQuantizationType.F32,
]:
data = data_torch.squeeze().cpu().numpy()
# sync with the new version of gguf (no eager squeeze/convert here)
# data = data_torch.squeeze().cpu().numpy()

# if data ends up empty, it means data_torch was a scalar tensor -> restore
if len(data.shape) == 0:
if len(data_torch.shape) == 0:
data = data_torch.numpy()
try:
data = data_torch.cpu().numpy()
data = gguf.quants.quantize(data, data_qtype)
except gguf.QuantError as e:
logger.warning("%s, %s", e, "falling back to F16")
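Reviewer note: a sketch of the new block-size guard added above, assuming GGML_QUANT_SIZES maps a lowercase qtype name to (block_size, type_size) exactly as the patch uses it; fallback_qtype is a hypothetical helper name for illustration.

import gguf
from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES


def fallback_qtype(data_qtype, last_dim):
    # GGML quant types pack the last dimension in fixed-size blocks; if the
    # row length is not a multiple of block_size, fall back to F16.
    name = data_qtype.name.lower()
    if name in GGML_QUANT_SIZES:
        block_size, _type_size = GGML_QUANT_SIZES[name]
        if last_dim % block_size != 0:
            return gguf.GGMLQuantizationType.F16
    return data_qtype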
2 changes: 1 addition & 1 deletion auto_round/export/export_to_gguf/packing.py
@@ -22,6 +22,7 @@


def register_qtype(name):

def register(cls):
GGML_QUANT_TYPE[name] = cls
return cls
@@ -109,7 +110,6 @@ def ggml_quant(
else:
new_data = np.concatenate(results, axis=0)
new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size) # Check shape correctness
new_data = new_data.reshape(*shape[:-1], -1)
return new_data


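Reviewer note on the reshape change: each block of block_size input values packs into type_size bytes, so the last packed dimension equals shape[-1] // block_size * type_size; reshaping with -1 lets NumPy infer that same length. A self-contained toy check (Q4_0-like sizes, zero-filled bytes, not real quantized data):

import numpy as np

# Toy illustration: 2 rows of 64 values, block_size=32, type_size=18 (Q4_0-like).
block_size, type_size = 32, 18
shape = (2, 64)
packed = np.zeros((shape[0] * shape[1] // block_size, type_size), dtype=np.uint8)

# Explicit last dimension ...
explicit = packed.reshape(*shape[:-1], shape[-1] // block_size * type_size)
# ... and the equivalent inferred form kept by the diff.
inferred = packed.reshape(*shape[:-1], -1)
assert explicit.shape == inferred.shape == (2, 36)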
12 changes: 12 additions & 0 deletions test/helpers.py
@@ -70,6 +70,18 @@ def slice_layers(module):

if hasattr(model.config, "num_hidden_layers"):
model.config.num_hidden_layers = num_layers
if hasattr(model.config, "text_config"):
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers"]
for key in n_block_keys:
if hasattr(model.config.text_config, key):
setattr(model.config.text_config, key, num_layers)
if hasattr(model.config.text_config, "layer_types"):
model.config.text_config.layer_types = model.config.text_config.layer_types[:num_layers]
if hasattr(model.config, "vision_config"):
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers"]
for key in n_block_keys:
if hasattr(model.config.vision_config, key):
setattr(model.config.vision_config, key, num_layers + 1)
if hasattr(model.config, "layer_types"):
model.config.layer_types = model.config.layer_types[:num_layers]

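Reviewer note: the same truncation pattern is repeated above for text_config and vision_config. A sketch of the shared logic, with truncate_layer_count as a hypothetical helper name:

def truncate_layer_count(config, num_layers):
    # Clamp whichever layer-count attribute this HF config actually defines.
    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers"]
    for key in n_block_keys:
        if hasattr(config, key):
            setattr(config, key, num_layers)
    # Per-layer metadata such as layer_types must be truncated to match.
    if hasattr(config, "layer_types"):
        config.layer_types = config.layer_types[:num_layers]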
13 changes: 6 additions & 7 deletions test/test_cpu/export/test_gguf_format.py
@@ -190,18 +190,17 @@ def test_all_format(self, tiny_qwen_model_path):
shutil.rmtree("../../tmp_autoround", ignore_errors=True)

def test_vlm_gguf(self):
from ...helpers import save_tiny_model

model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct")
tiny_model_path = save_tiny_model(model_name, "./tmp/tiny_qwen_vl_model_path", num_layers=2, is_mllm=True)
from auto_round import AutoRoundMLLM
from auto_round.utils import mllm_load_model

model, processor, tokenizer, image_processor = mllm_load_model(model_name)
autoround = AutoRoundMLLM(
model,
tokenizer=tokenizer,
processor=processor,
image_processor=image_processor,
tiny_model_path,
iters=0,
nsamples=8,
disable_opt_rtn=True,
)
quantized_model_path = "./saved"
autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
@@ -211,7 +210,7 @@ def test_vlm_gguf(self):
if file_name == "mmproj-model.gguf":
assert abs(file_size - 2537) < 5.0
else:
assert abs(file_size - 892) < 5.0
assert abs(file_size - 238) < 5.0
shutil.rmtree("./saved", ignore_errors=True)

def test_qtype_setting(self):
52 changes: 26 additions & 26 deletions test/test_cuda/export/test_gguf.py
@@ -140,46 +140,46 @@ def test_all_format(self):
shutil.rmtree(self.save_dir, ignore_errors=True)

@require_gguf
def test_vlm_gguf(self):
model_name = "/models/Qwen2-VL-2B-Instruct"
from auto_round import AutoRoundMLLM
from auto_round.utils import mllm_load_model
def test_special_model(self):
from ...helpers import save_tiny_model

model, processor, tokenizer, image_processor = mllm_load_model(model_name)
autoround = AutoRoundMLLM(
model,
tokenizer=tokenizer,
processor=processor,
image_processor=image_processor,
device="auto",
model_name = get_model_path("ibm-granite/granite-4.0-h-tiny")
tiny_model_path = save_tiny_model(model_name, "tiny_model_path", num_layers=2)
from auto_round import AutoRound

autoround = AutoRound(
tiny_model_path,
iters=0,
nsamples=8,
disable_opt_rtn=True,
)
quantized_model_path = "./saved"
autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
assert "mmproj-model.gguf" in os.listdir("./saved")
file_size = os.path.getsize("./saved/Qwen2-VL-2B-Instruct-Q4_0.gguf") / 1024**2
assert abs(file_size - 894) < 5.0
file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2
assert abs(file_size - 2580) < 5.0
file_name = os.listdir(quantized_model_path)[0]
file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2
assert abs(file_size - 307) < 5.0
shutil.rmtree("./saved", ignore_errors=True)

model_name = "/models/gemma-3-12b-it"
@require_gguf
def test_vlm_gguf(self):
from ...helpers import save_tiny_model

model_name = "/models/gemma-3-4b-it"
tiny_model_path = save_tiny_model(model_name, "tiny_model_path", num_layers=2, is_mllm=True)
from auto_round import AutoRound

model, processor, tokenizer, image_processor = mllm_load_model(model_name)
autoround = AutoRoundMLLM(
model,
tokenizer=tokenizer,
processor=processor,
image_processor=image_processor,
autoround = AutoRound(
tiny_model_path,
device="auto",
nsamples=32,
iters=0,
disable_opt_rtn=True,
)
quantized_model_path = "./saved"
autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m")
assert "mmproj-model.gguf" in os.listdir("./saved")
file_size = os.path.getsize("./saved/gemma-3-12B-it-Q4_K_M.gguf") / 1024**2
assert abs(file_size - 6568) < 5.0
file_size = os.path.getsize("./saved/tiny_model_path-860M-Q4_K_M.gguf") / 1024**2
assert abs(file_size - 639) < 5.0
file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2
assert abs(file_size - 1599) < 5.0
assert abs(file_size - 75) < 5.0
shutil.rmtree(quantized_model_path, ignore_errors=True)