2 changes: 1 addition & 1 deletion auto_round/formats.py
@@ -493,7 +493,7 @@ def save_quantized(

@OutputFormat.register("auto_awq")
class AutoAWQFormat(OutputFormat):
support_schemes = ["W4A16", "W2A16", "W3A16", "W8A16", "BF16", "W2A16G64", "W2A16G32"]
support_schemes = ["W4A16"]
format_name = "auto_awq"

@staticmethod
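Note: `support_schemes` is presumably what the exporter consults when validating a requested scheme, so trimming the list to `["W4A16"]` makes unsupported AWQ schemes fail fast instead of exporting broken checkpoints. A minimal sketch of the registry pattern in play — the `OutputFormat` internals below are assumptions for illustration, not the repository's actual implementation:

```python
from typing import Dict, List, Type


class OutputFormat:
    """Sketch of a name-keyed format registry; real internals may differ."""

    _registry: Dict[str, Type["OutputFormat"]] = {}
    support_schemes: List[str] = []
    format_name: str = ""

    @classmethod
    def register(cls, name: str):
        def decorator(fmt_cls: Type["OutputFormat"]) -> Type["OutputFormat"]:
            cls._registry[name] = fmt_cls
            return fmt_cls
        return decorator

    @classmethod
    def get(cls, name: str) -> Type["OutputFormat"]:
        return cls._registry[name]


@OutputFormat.register("auto_awq")
class AutoAWQFormat(OutputFormat):
    support_schemes = ["W4A16"]  # after this PR, only W4A16 is advertised
    format_name = "auto_awq"


# A requested scheme can then be validated before export:
assert "W4A16" in OutputFormat.get("auto_awq").support_schemes
```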
3 changes: 1 addition & 2 deletions docs/step_by_step.md
@@ -152,11 +152,10 @@ adopted within the community, **only 4-bits quantization is supported**. Please
|export format | supported scheme |
|--------------|------------------|
|**auto_round** | W4A16, W2A16, W3A16, W8A16, MXFP4, MXFP8, NVFP4, FPW8A16, W2A16G64, W2A16G32, FP8_STATIC, BF16|
> **Contributor:** Schemes without kernel support are shown in gray (or marked differently).

> **Contributor:** This issue has not been resolved.
|**auto_awq / auto_round:auto_awq** | W4A16, W2A16, W3A16, W8A16, BF16, W2A16G64, W2A16G32 |
|**auto_awq / auto_round:auto_awq** | W4A16|
|**auto_gptq / auto_round:auto_gptq / auto_round:gptqmodel**|W4A16, W2A16, W3A16, W8A16, BF16, W2A16G64, W2A16G32|
|**llm_compressor / auto_round:llm_compressor** | MXFP4, MXFP8, NVFP4, FPW8A16, FP8_STATIC |
|**gguf** | GGUF:Q4_0, GGUF:Q4_1, GGUF:Q5_0, GGUF:Q5_1, GGUF:Q2_K_S, GGUF:Q3_K_S, GGUF:Q3_K_M, GGUF:Q3_K_L, GGUF:Q4_K_S, GGUF:Q4_K_M, GGUF:Q5_K_S, GGUF:Q5_K_M, GGUF:Q6_K, GGUF:Q8_0 |
|**itrex / itrex_xpu** | W4A16, W2A16, W3A16, W8A16, BF16, W2A16G64, W2A16G32 |
|**fake** | all scheme|
### Hardware Compatibility

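As a quick illustration of the table, a hedged sketch of selecting an export format and scheme through the Python API, mirroring the `AutoRound(...)` and `quantize_and_save(...)` calls used in this PR's tests (`facebook/opt-125m` is a placeholder model):

```python
from auto_round import AutoRound

# Any HF model name or local path works the same way.
ar = AutoRound("facebook/opt-125m", scheme="W4A16", iters=2, nsamples=1, seqlen=32)

# Each format accepts only the schemes listed in its table row;
# auto_awq, for example, now accepts W4A16 only.
ar.quantize_and_save(output_dir="./saved", format="auto_round")
```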
1 change: 1 addition & 0 deletions test/helpers.py
@@ -10,6 +10,7 @@

# Automatic choose local path or model name.
def get_model_path(model_name: str) -> str:
model_name = model_name.rstrip("/")
ut_path = f"/tf_dataset/auto_round/models/{model_name}"
local_path = f"/models/{model_name.split('/')[-1]}"

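A self-contained sketch of the patched helper: the new `rstrip("/")` matters because a trailing slash makes `model_name.split("/")[-1]` return an empty string, breaking the local-path fallback. The existence checks below are assumptions about how the elided remainder of the function behaves:

```python
import os


def get_model_path(model_name: str) -> str:
    """Automatically choose a local path or fall back to the model name."""
    # Strip a trailing slash so split("/")[-1] yields the model directory
    # name rather than an empty string (the bug this line fixes).
    model_name = model_name.rstrip("/")
    ut_path = f"/tf_dataset/auto_round/models/{model_name}"
    local_path = f"/models/{model_name.split('/')[-1]}"
    if os.path.exists(ut_path):      # assumed: prefer the CI dataset mirror
        return ut_path
    if os.path.exists(local_path):   # assumed: then a local /models copy
        return local_path
    return model_name                # otherwise use the hub name as-is
```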
17 changes: 9 additions & 8 deletions test/test_cpu/test_cli_usage.py
@@ -6,6 +6,7 @@


class TestAutoRoundCmd:

@classmethod
def setup_class(self):
pass
@@ -21,48 +22,48 @@ def test_auto_round_cmd(self, tiny_opt_model_path, tiny_qwen_vl_model_path):
python_path = sys.executable

# Test llm script
res = os.system(f"cd .. && {python_path} -m auto_round -h")
res = os.system(f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round -h")
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"

res = os.system(
f"cd .. && {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa"
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa"
)
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"

res = os.system(
f"cd .. && {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32"
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32"
)
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"

res = os.system(
f"cd .. && {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai"
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai"
)
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"

# test mllm script

# test auto_round_mllm --eval help
res = os.system(f"cd .. && {python_path} -m auto_round --eval -h")
res = os.system(f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --eval -h")
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"

# test auto_round_mllm --lmms help
res = os.system(f"cd .. && {python_path} -m auto_round --eval --lmms -h")
res = os.system(f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --eval --lmms -h")
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"

res = os.system(
f"cd .. && {python_path} -m auto_round --mllm --model {tiny_qwen_vl_model_path} --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved"
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --mllm --model {tiny_qwen_vl_model_path} --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved"
)
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"

res = os.system(
f"cd .. && {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model {tiny_qwen_vl_model_path} --seqlen 32 --format auto_round"
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model {tiny_qwen_vl_model_path} --seqlen 32 --format auto_round"
" --quant_nontext_module --output_dir ./saved "
)
if res > 0 or res == -1:
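The recurring edit in this file swaps `cd ..` for a `PYTHONPATH` prefix: the subprocess still imports the in-repo `auto_round` sources, but the working directory no longer changes, so artifacts such as `./saved` land under the test directory. A hedged sketch of a helper that would factor out the repeated pattern (not part of this PR):

```python
import os
import sys


def run_auto_round_cli(args: str) -> None:
    """Hypothetical helper mirroring the command pattern in these tests."""
    # Prepend the repo root so `python -m auto_round` resolves to the
    # checked-out sources while the cwd (and ./saved) stays put.
    cmd = f"PYTHONPATH='../..:$PYTHONPATH' {sys.executable} -m auto_round {args}"
    res = os.system(cmd)
    # os.system returns the raw wait status; non-zero means failure.
    assert res == 0, f"cmd line test fail, please have a check: {cmd}"
```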
10 changes: 5 additions & 5 deletions test/test_cpu/test_gguf_format.py
@@ -26,15 +26,15 @@ def teardown_class(self):
def test_basic_usage(self, tiny_gemma_model_path, tiny_qwen_model_path):
python_path = sys.executable
res = os.system(
f"cd .. && {python_path} -m auto_round --model {tiny_gemma_model_path} "
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --model {tiny_gemma_model_path} "
f" --bs 16 --iters 0 --nsamples 1 --format gguf:q4_k_m"
)
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"
shutil.rmtree("./saved", ignore_errors=True)

res = os.system(
f"cd .. && {python_path} -m auto_round --model {tiny_qwen_model_path}"
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --model {tiny_qwen_model_path}"
f" --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0"
)
if res > 0 or res == -1:
@@ -162,15 +162,15 @@ def test_all_format(self, tiny_qwen_model_path):
# for gguf_format in ["gguf:q4_0", "gguf:q4_1", "gguf:q4_k_m", "gguf:q6_k"]:
for gguf_format in ["gguf:q4_k_m"]:
res = os.system(
f"cd .. && {python_path} -m auto_round --model {model_name} "
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --model {model_name} "
f" --bs 16 --iters 1 --nsamples 1 --seqlen 16 --format {gguf_format}"
)
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"
shutil.rmtree("../../tmp_autoround", ignore_errors=True)

res = os.system(
f"cd .. && {python_path} -m auto_round --model {model_name}"
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --model {model_name}"
f" --bs 16 --iters 0 --nsamples 1 --seqlen 16 --format fake,{gguf_format}"
)
if res > 0 or res == -1:
@@ -179,7 +179,7 @@ def test_all_format(self, tiny_qwen_model_path):

# test mixed q2_k_s
res = os.system(
f"cd .. && {python_path} -m auto_round --model {model_name}"
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --model {model_name}"
f" --bs 16 --iters 0 --nsamples 1 --seqlen 16 --scheme GGUF:Q2_K_MIXED"
)
if res > 0 or res == -1:
4 changes: 2 additions & 2 deletions test/test_cuda/test_alg_ext.py
@@ -49,13 +49,13 @@ def test_cli(self, tiny_opt_model_path):
python_path = sys.executable

res = os.system(
f"cd .. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits --nsamples 1 --seqlen 32"
f"PYTHONPATH='../..:$PYTHONPATH' CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits --nsamples 1 --seqlen 32"
)
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"

res = os.system(
f"cd .. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile --nsamples 1 --seqlen 32"
f"PYTHONPATH='../..:$PYTHONPATH' CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile --nsamples 1 --seqlen 32"
)
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"
24 changes: 12 additions & 12 deletions test/test_cuda/test_fp8_input.py
@@ -64,8 +64,8 @@ def test_gguf_imatrix(self):
# print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))

def test_small_model_rtn(self):
model, tokenizer = self.tiny_fp8_model()
ar = AutoRound(model=model, tokenizer=tokenizer, iters=0)
model_name = get_model_path("qwen/Qwen3-0.6B-FP8")
ar = AutoRound(model=model_name, iters=0)
_, folder = ar.quantize_and_save(output_dir=self.save_dir)
model_args = f"pretrained={self.save_dir}"
result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
@@ -75,8 +75,8 @@ def test_small_model_rtn(self):
shutil.rmtree(self.save_dir, ignore_errors=True)

def test_small_model_iters1(self):
model, tokenizer = self.tiny_fp8_model()
ar = AutoRound(model=model, tokenizer=tokenizer, iters=1)
model_name = get_model_path("qwen/Qwen3-0.6B-FP8")
ar = AutoRound(model=model_name, iters=1)
_, folder = ar.quantize_and_save(output_dir=self.save_dir)
model_args = f"pretrained={self.save_dir}"
result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
@@ -86,25 +86,25 @@ def test_small_model_iters1(self):
shutil.rmtree(self.save_dir, ignore_errors=True)

def test_medium_model_rtn(self):
model, tokenizer = self.tiny_fp8_model()
ar = AutoRound(model=model, tokenizer=tokenizer, iters=0)
model_name = get_model_path("qwen/Qwen3-0.6B-FP8")
ar = AutoRound(model=model_name, iters=0)
_, folder = ar.quantize_and_save(output_dir=self.save_dir)
model_args = f"pretrained={self.save_dir}"
result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
print(result["results"]["lambada_openai"]["acc,none"])
assert result["results"]["lambada_openai"]["acc,none"] > 0.55
assert result["results"]["lambada_openai"]["acc,none"] > 0.33

shutil.rmtree(self.save_dir, ignore_errors=True)

def test_medium_model_rtn_with_lm_head(self):
model, tokenizer = self.tiny_fp8_model()
model_name = get_model_path("qwen/Qwen3-0.6B-FP8")
layer_config = {"lm_head": {"bits": 4}}
ar = AutoRound(model=model, tokenizer=tokenizer, iters=0, layer_config=layer_config)
ar = AutoRound(model=model_name, iters=0, layer_config=layer_config)
_, folder = ar.quantize_and_save(output_dir=self.save_dir)
model_args = f"pretrained={self.save_dir}"
result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
print(result["results"]["lambada_openai"]["acc,none"])
assert result["results"]["lambada_openai"]["acc,none"] > 0.55
assert result["results"]["lambada_openai"]["acc,none"] > 0.33

shutil.rmtree(self.save_dir, ignore_errors=True)

@@ -135,9 +135,9 @@ def test_fp8_model_gguf(self):

def test_diff_datatype(self):
for scheme in ["NVFP4", "MXFP4"]:
model, tokenizer = self.tiny_fp8_model()
model_name = get_model_path("qwen/Qwen3-0.6B-FP8")
for iters in [0, 1]:
print(f"Testing scheme: {scheme}, iters: {iters}")
ar = AutoRound(model=model, tokenizer=tokenizer, iters=iters, scheme=scheme)
ar = AutoRound(model_name, iters=iters, scheme=scheme)
ar.quantize_and_save(output_dir=self.save_dir)
shutil.rmtree(self.save_dir, ignore_errors=True)
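These edits replace a preloaded `(model, tokenizer)` pair with a model path string, letting `AutoRound` handle loading itself; the accuracy floor is also relaxed from 0.55 to 0.33 for the 0.6B model. A condensed sketch of the new pattern (model name as a placeholder):

```python
from auto_round import AutoRound

# Passing a name/path string lets AutoRound load model and tokenizer itself,
# replacing the old preloaded (model, tokenizer) pair.
ar = AutoRound(model="Qwen/Qwen3-0.6B-FP8", iters=0)  # iters=0: RTN, no tuning
_, folder = ar.quantize_and_save(output_dir="./saved")
# The tests then evaluate the saved checkpoint on lambada_openai and
# assert accuracy stays above the relaxed 0.33 floor.
```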
8 changes: 4 additions & 4 deletions test/test_cuda/test_gguf.py
@@ -56,16 +56,16 @@ def test_gguf_format(self, tiny_qwen_model_path, dataloader):

save_dir = os.path.join(os.path.dirname(__file__), "saved")
res = os.system(
f"cd .. && {sys.executable} -m auto_round --model {tiny_qwen_model_path} --iter 2 "
f"PYTHONPATH='../..:$PYTHONPATH' {sys.executable} -m auto_round --model {tiny_qwen_model_path} --iter 2 "
f"--output_dir {save_dir} --nsample 2 --format gguf:q4_0 --device 0"
)
print(save_dir)
assert not (res > 0 or res == -1), "qwen2 tuning fail"

from llama_cpp import Llama

gguf_file = os.listdir(f"{save_dir}/tmp_tiny_qwen_model_path-gguf")[0]
llm = Llama(f"{save_dir}/tmp_tiny_qwen_model_path-gguf/{gguf_file}", n_gpu_layers=-1)
gguf_file = os.listdir(f"{save_dir}/tiny_qwen_model_path-gguf")[0]
llm = Llama(f"{save_dir}/tiny_qwen_model_path-gguf/{gguf_file}", n_gpu_layers=-1)
output = llm("There is a girl who likes adventure,", max_tokens=32)
print(output)
shutil.rmtree(save_dir, ignore_errors=True)
@@ -155,7 +155,7 @@ def test_vlm_gguf(self):
autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
assert "mmproj-model.gguf" in os.listdir("./saved")
file_size = os.path.getsize("./saved/Qwen2-VL-2B-Instruct-Q4_0.gguf") / 1024**2
assert abs(file_size - 4242) < 5.0
assert abs(file_size - 894) < 5.0
file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2
assert abs(file_size - 2580) < 5.0
shutil.rmtree("./saved", ignore_errors=True)
3 changes: 2 additions & 1 deletion test/test_cuda/test_marlin_backend.py
@@ -7,11 +7,12 @@
from auto_round import AutoRound, AutoRoundConfig
from auto_round.eval.evaluation import simple_evaluate_user_model

from ..helpers import model_infer
from ..helpers import get_model_path, model_infer


class TestAutoRoundMarlinBackend:
save_dir = "./saved"
model_name = get_model_path("facebook/opt-125m")

@pytest.fixture(autouse=True, scope="class")
def setup_and_teardown_class(self):
1 change: 0 additions & 1 deletion test/test_cuda/test_mix_bits.py
@@ -242,7 +242,6 @@ def test_mixed_autoround_format_vllm(self, tiny_opt_model_path, dataloader):
}
autoround = AutoRound(
tiny_opt_model_path,
self.tokenizer,
scheme="W4A16",
iters=2,
seqlen=2,
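Dropping the positional `self.tokenizer` follows the same API direction as the FP8 tests above: when `AutoRound` receives a model path, it presumably loads the tokenizer itself. A before/after sketch (`tiny_opt_model_path` is the fixture used in this test):

```python
# Before: tokenizer passed explicitly alongside the model path.
# autoround = AutoRound(tiny_opt_model_path, tokenizer, scheme="W4A16", iters=2, seqlen=2)

# After: the tokenizer argument is dropped; AutoRound loads it from the path.
autoround = AutoRound(tiny_opt_model_path, scheme="W4A16", iters=2, seqlen=2)
```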
2 changes: 1 addition & 1 deletion test/test_cuda/test_multiple_card.py
@@ -354,7 +354,7 @@ def test_device_map_for_triton(self):

@multi_card
def test_mllm_device_map(self):
model_name = get_model_path("qwen/Qwen2-VL-2B-Instruct/")
model_name = get_model_path("qwen/Qwen2-VL-2B-Instruct")
from auto_round import AutoRoundMLLM

device_map = "0,1"
4 changes: 2 additions & 2 deletions test/test_cuda/test_multiple_card_calib.py
@@ -41,7 +41,7 @@ def test_multiple_card_calib(self):

##test llm script
res = os.system(
f"cd .. && {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --iters 1 --nsamples 1 --output_dir None"
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --iters 1 --nsamples 1 --output_dir None"
)
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"
@@ -52,7 +52,7 @@ def test_multiple_card_nvfp4(self):

##test llm script
res = os.system(
f"cd .. && {python_path} -m auto_round --model facebook/opt-125m --scheme NVFP4 --devices '0,1' --iters 1 --nsamples 1 --enable_torch_compile --low_gpu_mem_usage"
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --model facebook/opt-125m --scheme NVFP4 --devices '0,1' --iters 1 --nsamples 1 --enable_torch_compile --low_gpu_mem_usage"
)
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"
2 changes: 1 addition & 1 deletion test/test_cuda/test_scheme.py
@@ -42,7 +42,7 @@ def test_w2a16(self, tiny_opt_model_path):
ar.quantize()

def test_mxfp4(self, tiny_opt_model_path):
ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1)
ar = AutoRound(tiny_opt_model_path, scheme="MXFP8_RCEIL", nsamples=1, iters=1)
assert ar.bits == 4
assert ar.act_bits == 4
assert ar.data_type == "mx_fp"
11 changes: 6 additions & 5 deletions test/test_cuda/test_support_vlms.py
@@ -11,6 +11,7 @@


class TestSupportVLMS:

@classmethod
def setup_class(self):
self.save_dir = os.path.join(os.path.dirname(__file__), "ut_saved")
@@ -26,7 +27,7 @@ def test_qwen2(self):
model_path = "/models/Qwen2-VL-2B-Instruct/"
# test tune
res = os.system(
f"cd .. && {self.python_path} -m auto_round --mllm "
f"PYTHONPATH='../..:$PYTHONPATH' {self.python_path} -m auto_round --mllm "
f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}"
)
assert not (res > 0 or res == -1), "qwen2 tuning fail"
@@ -81,7 +82,7 @@ def test_phi3(self):
model_path = "/models/Phi-3.5-vision-instruct/"
## test tune
res = os.system(
f"cd .. && {self.python_path} -m auto_round --mllm "
f"PYTHONPATH='../..:$PYTHONPATH' {self.python_path} -m auto_round --mllm "
f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}"
)
assert not (res > 0 or res == -1), "Phi-3.5 tuning fail"
@@ -129,7 +130,7 @@ def test_phi3_vision_awq(self):
model_path = "/models/Phi-3.5-vision-instruct/"
## test tune
res = os.system(
f"cd .. && {self.python_path} -m auto_round --mllm "
f"PYTHONPATH='../..:$PYTHONPATH' {self.python_path} -m auto_round --mllm "
f"--model {model_path} --iter 2 --quant_nontext_module "
f"--nsample 64 --seqlen 32 "
f"--format auto_awq --output_dir {self.save_dir} --device {self.device}"
@@ -177,7 +178,7 @@ def test_glm(self):
model_path = "/models/glm-4v-9b/"
## test tune
res = os.system(
f"cd .. && {self.python_path} -m auto_round "
f"PYTHONPATH='../..:$PYTHONPATH' {self.python_path} -m auto_round "
f"--model {model_path} --iter 1 --output_dir {self.save_dir} --device {self.device}"
)
assert not (res > 0 or res == -1), "glm-4v-9b tuning fail"
@@ -186,7 +187,7 @@ def test_granite_vision(self):
model_path = "/models/granite-vision-3.2-2b"
## test tune
res = os.system(
f"cd .. && {self.python_path} -m auto_round "
f"PYTHONPATH='../..:$PYTHONPATH' {self.python_path} -m auto_round "
f"--model {model_path} --iter 1 --output_dir {self.save_dir} --device {self.device}"
)
assert not (res > 0 or res == -1), "granite-vision-3.2-2b tuning fail"