97 changes: 53 additions & 44 deletions auto_round/export/export_to_gguf/convert.py
@@ -229,51 +229,45 @@ def _quant_data(cls, data_torch, data_qtype, name, modify_name, new_name, bid, d
"""
suffix = ".weight"
device = data_torch.device if device is None else device
if suffix in name:

if name.endswith(suffix):
layer_name = name[: -len(suffix)]
module = get_module(cls.model, layer_name)
kwargs = {
"scale": None,
"zp": None,
"d_scale": None,
"d_wmin": None,
"wmin": None,
"imatrix": None,
}
if hasattr(module, "scale"):
else:
layer_name = name
module = get_module(cls.model, layer_name)
kwargs = {
"scale": None,
"zp": None,
"d_scale": None,
"d_wmin": None,
"wmin": None,
"imatrix": None,
}
# support for MoE models whose experts module is not a Linear layer
# if hasattr(module, "scale") or ("exps" in new_name and len(data_torch.shape) == 3):
for attr in ["scale", "zp", "w_d_scale", "w_d_wmin", "w_wmin"]:
if hasattr(module, attr) and getattr(module, attr) is not None:
attr_tensor = getattr(module, attr)
if not isinstance(attr_tensor, torch.Tensor):
continue
if hasattr(cls, "permute"):
bs = module.weight.shape[0]
for attr in ["scale", "zp", "w_d_scale", "w_d_wmin", "w_wmin"]:
if hasattr(module, attr) and getattr(module, attr) is not None:
attr_tensor = getattr(module, attr)
if not isinstance(attr_tensor, torch.Tensor):
continue
attr_tensors_dict = dict(cls.modify_tensors(attr_tensor.reshape(bs, -1), modify_name, bid))
attr_tensor = attr_tensors_dict[new_name]
if attr in kwargs:
kwargs[attr] = attr_tensor.to(torch.float32)
else:
kwargs[attr.replace("w_", "")] = attr_tensor.to(torch.float32)
data_torch = data_torch.to(torch.float32)
attr_tensors_dict = dict(cls.modify_tensors(attr_tensor.reshape(bs, -1), modify_name, bid))
attr_tensor = attr_tensors_dict[new_name]
if attr in kwargs:
kwargs[attr] = attr_tensor.to(torch.float32)
else:
kwargs[attr.replace("w_", "")] = attr_tensor.to(torch.float32)
data_torch = data_torch.to(torch.float32)

data = ggml_quant(data_torch, data_qtype.name.lower(), device=device, **kwargs)
else:
# if data_torch.dtype ==torch.float32:
# data_qtype = gguf.GGMLQuantizationType.F32
# else:
# data_qtype = gguf.GGMLQuantizationType.F16
data_qtype = gguf.GGMLQuantizationType.F32 ##FP16 has issues at inference
data = data_torch.to(torch.float32).squeeze().cpu().numpy()
else:
# for Llama-4
# if data_torch.dtype == torch.float32:
# data_qtype = gguf.GGMLQuantizationType.F32
# else:
# data_qtype = gguf.GGMLQuantizationType.F16
# data = data_torch.squeeze().cpu().numpy()
# data_qtype = gguf.GGMLQuantizationType.F32
# data = data_torch.to(torch.float32).squeeze().cpu().numpy()
data = ggml_quant(data_torch, data_qtype.name.lower(), device=device)
data = ggml_quant(data_torch, data_qtype.name.lower(), device=device, **kwargs)
# else:
# # if data_torch.dtype ==torch.float32:
# # data_qtype = gguf.GGMLQuantizationType.F32
# # else:
# # data_qtype = gguf.GGMLQuantizationType.F16
# data_qtype = gguf.GGMLQuantizationType.F32 ##FP16 has issues at inference
# data = data_torch.to(torch.float32).squeeze().cpu().numpy()
return data, data_qtype
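Reviewer note: the parameter-gathering loop above is easier to follow in isolation. Below is a minimal sketch, assuming a module that may carry scale / zp / w_d_scale / w_d_wmin / w_wmin tensors attached by auto-round; collect_quant_kwargs and remap_fn are hypothetical names used only for illustration, not part of the patch.

import torch


def collect_quant_kwargs(module, weight_rows, remap_fn=None):
    # Optional per-layer quantization tensors attached by auto-round;
    # anything the module does not carry stays None.
    kwargs = {"scale": None, "zp": None, "d_scale": None, "d_wmin": None, "wmin": None, "imatrix": None}
    for attr in ["scale", "zp", "w_d_scale", "w_d_wmin", "w_wmin"]:
        tensor = getattr(module, attr, None)
        if not isinstance(tensor, torch.Tensor):
            continue
        if remap_fn is not None:
            # mirrors cls.modify_tensors(): reshape to (rows, -1) before remapping names
            tensor = remap_fn(tensor.reshape(weight_rows, -1))
        # "w_d_scale" / "w_d_wmin" / "w_wmin" map onto the "d_scale" / "d_wmin" / "wmin" kwargs
        key = attr if attr in kwargs else attr.replace("w_", "")
        kwargs[key] = tensor.to(torch.float32)
    return kwargs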


@@ -419,7 +413,9 @@ def prepare_tensors(cls):
break
if skip:
continue
data = data_torch.squeeze()
# sync with the new version of gguf: keep the original shape (no squeeze)
# data = data_torch.squeeze()
data = data_torch
n_dims = len(data.shape)
data_qtype: gguf.GGMLQuantizationType | bool = cls.tensor_force_quant(name, new_name, bid, n_dims)

Expand Down Expand Up @@ -529,17 +525,30 @@ def prepare_tensors(cls):
elif data_qtype == gguf.GGMLQuantizationType.Q6_K:
data_qtype = gguf.GGMLQuantizationType.Q8_0

from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES

if data_qtype.name.lower() in GGML_QUANT_SIZES:
block_size, type_size = GGML_QUANT_SIZES[data_qtype.name.lower()]
if data_torch.shape[-1] % block_size != 0:
logger.warning(
f"{new_name}: Can't quantize tensor with shape {data_torch.shape} to {data_qtype.name},"
" fallback to F16"
)
data_qtype = gguf.GGMLQuantizationType.F16

if isinstance(data_qtype, bool) or data_qtype in [
gguf.GGMLQuantizationType.F16,
gguf.GGMLQuantizationType.BF16,
gguf.GGMLQuantizationType.F32,
]:
data = data_torch.squeeze().cpu().numpy()
# sync with the new version of gguf (no eager squeeze/convert here)
# data = data_torch.squeeze().cpu().numpy()

# if data ends up empty, it means data_torch was a scalar tensor -> restore
if len(data.shape) == 0:
if len(data_torch.shape) == 0:
data = data_torch.numpy()
try:
data = data_torch.cpu().numpy()
data = gguf.quants.quantize(data, data_qtype)
except gguf.QuantError as e:
logger.warning("%s, %s", e, "falling back to F16")
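Reviewer note: a sketch of the new block-size guard added above, assuming GGML_QUANT_SIZES maps a lowercase qtype name to (block_size, type_size) exactly as the patch uses it; fallback_qtype is a hypothetical helper name for illustration.

import gguf
from auto_round.export.export_to_gguf.config import GGML_QUANT_SIZES


def fallback_qtype(data_qtype, last_dim):
    # GGML quant types pack the last dimension in fixed-size blocks; if the
    # row length is not a multiple of block_size, fall back to F16.
    name = data_qtype.name.lower()
    if name in GGML_QUANT_SIZES:
        block_size, _type_size = GGML_QUANT_SIZES[name]
        if last_dim % block_size != 0:
            return gguf.GGMLQuantizationType.F16
    return data_qtype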
2 changes: 1 addition & 1 deletion auto_round/export/export_to_gguf/packing.py
@@ -22,6 +22,7 @@


def register_qtype(name):

def register(cls):
GGML_QUANT_TYPE[name] = cls
return cls
@@ -109,7 +110,6 @@ def ggml_quant(
else:
new_data = np.concatenate(results, axis=0)
new_data = new_data.reshape(*shape[:-1], shape[-1] // block_size * type_size) # Check shape correctness
new_data = new_data.reshape(*shape[:-1], -1)
return new_data


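Reviewer note on the reshape change: each block of block_size input values packs into type_size bytes, so the last packed dimension equals shape[-1] // block_size * type_size; reshaping with -1 lets NumPy infer that same length. A self-contained toy check (Q4_0-like sizes, zero-filled bytes, not real quantized data):

import numpy as np

# Toy illustration: 2 rows of 64 values, block_size=32, type_size=18 (Q4_0-like).
block_size, type_size = 32, 18
shape = (2, 64)
packed = np.zeros((shape[0] * shape[1] // block_size, type_size), dtype=np.uint8)

# Explicit last dimension ...
explicit = packed.reshape(*shape[:-1], shape[-1] // block_size * type_size)
# ... and the equivalent inferred form kept by the diff.
inferred = packed.reshape(*shape[:-1], -1)
assert explicit.shape == inferred.shape == (2, 36)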
12 changes: 12 additions & 0 deletions test/helpers.py
@@ -70,6 +70,18 @@ def slice_layers(module):

if hasattr(model.config, "num_hidden_layers"):
model.config.num_hidden_layers = num_layers
if hasattr(model.config, "text_config"):
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers"]
for key in n_block_keys:
if hasattr(model.config.text_config, key):
setattr(model.config.text_config, key, num_layers)
if hasattr(model.config.text_config, "layer_types"):
model.config.text_config.layer_types = model.config.text_config.layer_types[:num_layers]
if hasattr(model.config, "vision_config"):
n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers"]
for key in n_block_keys:
if hasattr(model.config.vision_config, key):
setattr(model.config.vision_config, key, num_layers + 1)
if hasattr(model.config, "layer_types"):
model.config.layer_types = model.config.layer_types[:num_layers]

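Reviewer note: the same truncation pattern is repeated above for text_config and vision_config. A sketch of the shared logic, with truncate_layer_count as a hypothetical helper name:

def truncate_layer_count(config, num_layers):
    # Clamp whichever layer-count attribute this HF config actually defines.
    n_block_keys = ["n_layers", "num_hidden_layers", "n_layer", "num_layers", "depth", "encoder_layers"]
    for key in n_block_keys:
        if hasattr(config, key):
            setattr(config, key, num_layers)
    # Per-layer metadata such as layer_types must be truncated to match.
    if hasattr(config, "layer_types"):
        config.layer_types = config.layer_types[:num_layers]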
13 changes: 6 additions & 7 deletions test/test_cpu/export/test_gguf_format.py
@@ -190,18 +190,17 @@ def test_all_format(self, tiny_qwen_model_path):
shutil.rmtree("../../tmp_autoround", ignore_errors=True)

def test_vlm_gguf(self):
from ...helpers import save_tiny_model

model_name = get_model_path("Qwen/Qwen2-VL-2B-Instruct")
tiny_model_path = save_tiny_model(model_name, "./tmp/tiny_qwen_vl_model_path", num_layers=2, is_mllm=True)
from auto_round import AutoRoundMLLM
from auto_round.utils import mllm_load_model

model, processor, tokenizer, image_processor = mllm_load_model(model_name)
autoround = AutoRoundMLLM(
model,
tokenizer=tokenizer,
processor=processor,
image_processor=image_processor,
tiny_model_path,
iters=0,
nsamples=8,
disable_opt_rtn=True,
)
quantized_model_path = "./saved"
autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
@@ -211,7 +210,7 @@ def test_vlm_gguf(self):
if file_name == "mmproj-model.gguf":
assert abs(file_size - 2537) < 5.0
else:
assert abs(file_size - 892) < 5.0
assert abs(file_size - 238) < 5.0
shutil.rmtree("./saved", ignore_errors=True)

def test_qtype_setting(self):
52 changes: 26 additions & 26 deletions test/test_cuda/export/test_gguf.py
@@ -140,46 +140,46 @@ def test_all_format(self):
shutil.rmtree(self.save_dir, ignore_errors=True)

@require_gguf
def test_vlm_gguf(self):
model_name = "/models/Qwen2-VL-2B-Instruct"
from auto_round import AutoRoundMLLM
from auto_round.utils import mllm_load_model
def test_special_model(self):
from ...helpers import save_tiny_model

model, processor, tokenizer, image_processor = mllm_load_model(model_name)
autoround = AutoRoundMLLM(
model,
tokenizer=tokenizer,
processor=processor,
image_processor=image_processor,
device="auto",
model_name = get_model_path("ibm-granite/granite-4.0-h-tiny")
tiny_model_path = save_tiny_model(model_name, "tiny_model_path", num_layers=2)
from auto_round import AutoRound

autoround = AutoRound(
tiny_model_path,
iters=0,
nsamples=8,
disable_opt_rtn=True,
)
quantized_model_path = "./saved"
autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
assert "mmproj-model.gguf" in os.listdir("./saved")
file_size = os.path.getsize("./saved/Qwen2-VL-2B-Instruct-Q4_0.gguf") / 1024**2
assert abs(file_size - 894) < 5.0
file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2
assert abs(file_size - 2580) < 5.0
file_name = os.listdir(quantized_model_path)[0]
file_size = os.path.getsize(os.path.join(quantized_model_path, file_name)) / 1024**2
assert abs(file_size - 307) < 5.0
shutil.rmtree("./saved", ignore_errors=True)

model_name = "/models/gemma-3-12b-it"
@require_gguf
def test_vlm_gguf(self):
from ...helpers import save_tiny_model

model_name = "/models/gemma-3-4b-it"
tiny_model_path = save_tiny_model(model_name, "tiny_model_path", num_layers=2, is_mllm=True)
from auto_round import AutoRound

model, processor, tokenizer, image_processor = mllm_load_model(model_name)
autoround = AutoRoundMLLM(
model,
tokenizer=tokenizer,
processor=processor,
image_processor=image_processor,
autoround = AutoRound(
tiny_model_path,
device="auto",
nsamples=32,
iters=0,
disable_opt_rtn=True,
)
quantized_model_path = "./saved"
autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_k_m")
assert "mmproj-model.gguf" in os.listdir("./saved")
file_size = os.path.getsize("./saved/gemma-3-12B-it-Q4_K_M.gguf") / 1024**2
assert abs(file_size - 6568) < 5.0
file_size = os.path.getsize("./saved/tiny_model_path-860M-Q4_K_M.gguf") / 1024**2
assert abs(file_size - 639) < 5.0
file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2
assert abs(file_size - 1599) < 5.0
assert abs(file_size - 75) < 5.0
shutil.rmtree(quantized_model_path, ignore_errors=True)