2 changes: 1 addition & 1 deletion auto_round/formats.py
@@ -493,7 +493,7 @@ def save_quantized(

@OutputFormat.register("auto_awq")
class AutoAWQFormat(OutputFormat):
support_schemes = ["W4A16", "W2A16", "W3A16", "W8A16", "BF16", "W2A16G64", "W2A16G32"]
support_schemes = ["W4A16"]
format_name = "auto_awq"

@staticmethod
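Note: `support_schemes` is presumably what the exporter consults when validating a requested scheme, so trimming the list to `["W4A16"]` makes unsupported AWQ schemes fail fast instead of exporting broken checkpoints. A minimal sketch of the registry pattern in play — the `OutputFormat` internals below are assumptions for illustration, not the repository's actual implementation:

```python
from typing import Dict, List, Type


class OutputFormat:
    """Sketch of a name-keyed format registry; real internals may differ."""

    _registry: Dict[str, Type["OutputFormat"]] = {}
    support_schemes: List[str] = []
    format_name: str = ""

    @classmethod
    def register(cls, name: str):
        def decorator(fmt_cls: Type["OutputFormat"]) -> Type["OutputFormat"]:
            cls._registry[name] = fmt_cls
            return fmt_cls
        return decorator

    @classmethod
    def get(cls, name: str) -> Type["OutputFormat"]:
        return cls._registry[name]


@OutputFormat.register("auto_awq")
class AutoAWQFormat(OutputFormat):
    support_schemes = ["W4A16"]  # after this PR, only W4A16 is advertised
    format_name = "auto_awq"


# A requested scheme can then be validated before export:
assert "W4A16" in OutputFormat.get("auto_awq").support_schemes
```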
3 changes: 1 addition & 2 deletions docs/step_by_step.md
@@ -152,11 +152,10 @@ adopted within the community, **only 4-bits quantization is supported**. Please
|export format | supported scheme |
|--------------|------------------|
|**auto_round** | W4A16, W2A16, W3A16, W8A16, MXFP4, MXFP8, NVFP4, FPW8A16, W2A16G64, W2A16G32, FP8_STATIC, BF16|
> **Contributor:** Schemes without kernel support are shown in gray (or marked differently).

> **Contributor:** This issue has not been resolved.
|**auto_awq / auto_round:auto_awq** | W4A16, W2A16, W3A16, W8A16, BF16, W2A16G64, W2A16G32 |
|**auto_awq / auto_round:auto_awq** | W4A16|
|**auto_gptq / auto_round:auto_gptq / auto_round:gptqmodel**|W4A16, W2A16, W3A16, W8A16, BF16, W2A16G64, W2A16G32|
|**llm_compressor / auto_round:llm_compressor** | MXFP4, MXFP8, NVFP4, FPW8A16, FP8_STATIC |
|**gguf** | GGUF:Q4_0, GGUF:Q4_1, GGUF:Q5_0, GGUF:Q5_1, GGUF:Q2_K_S, GGUF:Q3_K_S, GGUF:Q3_K_M, GGUF:Q3_K_L, GGUF:Q4_K_S, GGUF:Q4_K_M, GGUF:Q5_K_S, GGUF:Q5_K_M, GGUF:Q6_K, GGUF:Q8_0 |
|**itrex / itrex_xpu** | W4A16, W2A16, W3A16, W8A16, BF16, W2A16G64, W2A16G32 |
|**fake** | all scheme|
### Hardware Compatibility

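As a quick illustration of the table, a hedged sketch of selecting an export format and scheme through the Python API, mirroring the `AutoRound(...)` and `quantize_and_save(...)` calls used in this PR's tests (`facebook/opt-125m` is a placeholder model):

```python
from auto_round import AutoRound

# Any HF model name or local path works the same way.
ar = AutoRound("facebook/opt-125m", scheme="W4A16", iters=2, nsamples=1, seqlen=32)

# Each format accepts only the schemes listed in its table row;
# auto_awq, for example, now accepts W4A16 only.
ar.quantize_and_save(output_dir="./saved", format="auto_round")
```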
1 change: 1 addition & 0 deletions test/helpers.py
@@ -10,6 +10,7 @@

# Automatic choose local path or model name.
def get_model_path(model_name: str) -> str:
model_name = model_name.rstrip("/")
ut_path = f"/tf_dataset/auto_round/models/{model_name}"
local_path = f"/models/{model_name.split('/')[-1]}"

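A self-contained sketch of the patched helper: the new `rstrip("/")` matters because a trailing slash makes `model_name.split("/")[-1]` return an empty string, breaking the local-path fallback. The existence checks below are assumptions about how the elided remainder of the function behaves:

```python
import os


def get_model_path(model_name: str) -> str:
    """Automatically choose a local path or fall back to the model name."""
    # Strip a trailing slash so split("/")[-1] yields the model directory
    # name rather than an empty string (the bug this line fixes).
    model_name = model_name.rstrip("/")
    ut_path = f"/tf_dataset/auto_round/models/{model_name}"
    local_path = f"/models/{model_name.split('/')[-1]}"
    if os.path.exists(ut_path):      # assumed: prefer the CI dataset mirror
        return ut_path
    if os.path.exists(local_path):   # assumed: then a local /models copy
        return local_path
    return model_name                # otherwise use the hub name as-is
```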
17 changes: 9 additions & 8 deletions test/test_cpu/test_cli_usage.py
@@ -6,6 +6,7 @@


class TestAutoRoundCmd:

@classmethod
def setup_class(self):
pass
@@ -21,48 +22,48 @@ def test_auto_round_cmd(self, tiny_opt_model_path, tiny_qwen_vl_model_path):
python_path = sys.executable

# Test llm script
res = os.system(f"cd .. && {python_path} -m auto_round -h")
res = os.system(f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round -h")
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"

res = os.system(
f"cd .. && {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa"
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 32 --iter 2 --nsamples 1 --format auto_gptq,auto_round --output_dir ./saved --tasks piqa"
)
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"

res = os.system(
f"cd .. && {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32"
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --model {tiny_opt_model_path} --seqlen 8 --iter 1 --nsamples 1 --eval_task_by_task --tasks openbookqa --bs 32"
)
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"

res = os.system(
f"cd .. && {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai"
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -c 'from auto_round.__main__ import run_light; run_light()' --seqlen 8 --iter 2 --nsamples 8 --output_dir ./saved --tasks lambada_openai"
)
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"

# test mllm script

# test auto_round_mllm --eval help
res = os.system(f"cd .. && {python_path} -m auto_round --eval -h")
res = os.system(f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --eval -h")
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"

# test auto_round_mllm --lmms help
res = os.system(f"cd .. && {python_path} -m auto_round --eval --lmms -h")
res = os.system(f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --eval --lmms -h")
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"

res = os.system(
f"cd .. && {python_path} -m auto_round --mllm --model {tiny_qwen_vl_model_path} --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved"
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --mllm --model {tiny_qwen_vl_model_path} --iter 2 --nsamples 2 --seqlen 32 --format auto_round --output_dir ./saved"
)
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"

res = os.system(
f"cd .. && {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model {tiny_qwen_vl_model_path} --seqlen 32 --format auto_round"
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --mllm --iter 2 --nsamples 2 --model {tiny_qwen_vl_model_path} --seqlen 32 --format auto_round"
" --quant_nontext_module --output_dir ./saved "
)
if res > 0 or res == -1:
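The recurring edit in this file swaps `cd ..` for a `PYTHONPATH` prefix: the subprocess still imports the in-repo `auto_round` sources, but the working directory no longer changes, so artifacts such as `./saved` land under the test directory. A hedged sketch of a helper that would factor out the repeated pattern (not part of this PR):

```python
import os
import sys


def run_auto_round_cli(args: str) -> None:
    """Hypothetical helper mirroring the command pattern in these tests."""
    # Prepend the repo root so `python -m auto_round` resolves to the
    # checked-out sources while the cwd (and ./saved) stays put.
    cmd = f"PYTHONPATH='../..:$PYTHONPATH' {sys.executable} -m auto_round {args}"
    res = os.system(cmd)
    # os.system returns the raw wait status; non-zero means failure.
    assert res == 0, f"cmd line test fail, please have a check: {cmd}"
```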
10 changes: 5 additions & 5 deletions test/test_cpu/test_gguf_format.py
@@ -26,15 +26,15 @@ def teardown_class(self):
def test_basic_usage(self, tiny_gemma_model_path, tiny_qwen_model_path):
python_path = sys.executable
res = os.system(
f"cd .. && {python_path} -m auto_round --model {tiny_gemma_model_path} "
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --model {tiny_gemma_model_path} "
f" --bs 16 --iters 0 --nsamples 1 --format gguf:q4_k_m"
)
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"
shutil.rmtree("./saved", ignore_errors=True)

res = os.system(
f"cd .. && {python_path} -m auto_round --model {tiny_qwen_model_path}"
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --model {tiny_qwen_model_path}"
f" --bs 16 --iters 1 --nsamples 1 --format fake,gguf:q4_0"
)
if res > 0 or res == -1:
@@ -162,15 +162,15 @@ def test_all_format(self, tiny_qwen_model_path):
# for gguf_format in ["gguf:q4_0", "gguf:q4_1", "gguf:q4_k_m", "gguf:q6_k"]:
for gguf_format in ["gguf:q4_k_m"]:
res = os.system(
f"cd .. && {python_path} -m auto_round --model {model_name} "
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --model {model_name} "
f" --bs 16 --iters 1 --nsamples 1 --seqlen 16 --format {gguf_format}"
)
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"
shutil.rmtree("../../tmp_autoround", ignore_errors=True)

res = os.system(
f"cd .. && {python_path} -m auto_round --model {model_name}"
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --model {model_name}"
f" --bs 16 --iters 0 --nsamples 1 --seqlen 16 --format fake,{gguf_format}"
)
if res > 0 or res == -1:
@@ -179,7 +179,7 @@ def test_all_format(self, tiny_qwen_model_path):

# test mixed q2_k_s
res = os.system(
f"cd .. && {python_path} -m auto_round --model {model_name}"
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --model {model_name}"
f" --bs 16 --iters 0 --nsamples 1 --seqlen 16 --scheme GGUF:Q2_K_MIXED"
)
if res > 0 or res == -1:
4 changes: 2 additions & 2 deletions test/test_cuda/test_alg_ext.py
@@ -49,13 +49,13 @@ def test_cli(self, tiny_opt_model_path):
python_path = sys.executable

res = os.system(
f"cd .. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits --nsamples 1 --seqlen 32"
f"PYTHONPATH='../..:$PYTHONPATH' CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 2 --options=W2A16,W4A16 --ignore_scale_zp_bits --nsamples 1 --seqlen 32"
)
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"

res = os.system(
f"cd .. && CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile --nsamples 1 --seqlen 32"
f"PYTHONPATH='../..:$PYTHONPATH' CUDA_VISIBLE_DEVICES=0 {python_path} -m auto_round --model {tiny_opt_model_path} --iters 1 --device auto --enable_alg_ext --avg_bits 5.5 --options=mxfp4,mxfp8 --ignore_scale_zp_bits --enable_torch_compile --nsamples 1 --seqlen 32"
)
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"
24 changes: 12 additions & 12 deletions test/test_cuda/test_fp8_input.py
@@ -64,8 +64,8 @@ def test_gguf_imatrix(self):
# print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50)[0]))

def test_small_model_rtn(self):
model, tokenizer = self.tiny_fp8_model()
ar = AutoRound(model=model, tokenizer=tokenizer, iters=0)
model_name = get_model_path("qwen/Qwen3-0.6B-FP8")
ar = AutoRound(model=model_name, iters=0)
_, folder = ar.quantize_and_save(output_dir=self.save_dir)
model_args = f"pretrained={self.save_dir}"
result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
@@ -75,8 +75,8 @@ def test_small_model_rtn(self):
shutil.rmtree(self.save_dir, ignore_errors=True)

def test_small_model_iters1(self):
model, tokenizer = self.tiny_fp8_model()
ar = AutoRound(model=model, tokenizer=tokenizer, iters=1)
model_name = get_model_path("qwen/Qwen3-0.6B-FP8")
ar = AutoRound(model=model_name, iters=1)
_, folder = ar.quantize_and_save(output_dir=self.save_dir)
model_args = f"pretrained={self.save_dir}"
result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
@@ -86,25 +86,25 @@ def test_small_model_iters1(self):
shutil.rmtree(self.save_dir, ignore_errors=True)

def test_medium_model_rtn(self):
model, tokenizer = self.tiny_fp8_model()
ar = AutoRound(model=model, tokenizer=tokenizer, iters=0)
model_name = get_model_path("qwen/Qwen3-0.6B-FP8")
ar = AutoRound(model=model_name, iters=0)
_, folder = ar.quantize_and_save(output_dir=self.save_dir)
model_args = f"pretrained={self.save_dir}"
result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
print(result["results"]["lambada_openai"]["acc,none"])
assert result["results"]["lambada_openai"]["acc,none"] > 0.55
assert result["results"]["lambada_openai"]["acc,none"] > 0.33

shutil.rmtree(self.save_dir, ignore_errors=True)

def test_medium_model_rtn_with_lm_head(self):
model, tokenizer = self.tiny_fp8_model()
model_name = get_model_path("qwen/Qwen3-0.6B-FP8")
layer_config = {"lm_head": {"bits": 4}}
ar = AutoRound(model=model, tokenizer=tokenizer, iters=0, layer_config=layer_config)
ar = AutoRound(model=model_name, iters=0, layer_config=layer_config)
_, folder = ar.quantize_and_save(output_dir=self.save_dir)
model_args = f"pretrained={self.save_dir}"
result = simple_evaluate(model="hf", model_args=model_args, tasks="lambada_openai", batch_size="auto")
print(result["results"]["lambada_openai"]["acc,none"])
assert result["results"]["lambada_openai"]["acc,none"] > 0.55
assert result["results"]["lambada_openai"]["acc,none"] > 0.33

shutil.rmtree(self.save_dir, ignore_errors=True)

@@ -135,9 +135,9 @@ def test_fp8_model_gguf(self):

def test_diff_datatype(self):
for scheme in ["NVFP4", "MXFP4"]:
model, tokenizer = self.tiny_fp8_model()
model_name = get_model_path("qwen/Qwen3-0.6B-FP8")
for iters in [0, 1]:
print(f"Testing scheme: {scheme}, iters: {iters}")
ar = AutoRound(model=model, tokenizer=tokenizer, iters=iters, scheme=scheme)
ar = AutoRound(model_name, iters=iters, scheme=scheme)
ar.quantize_and_save(output_dir=self.save_dir)
shutil.rmtree(self.save_dir, ignore_errors=True)
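These edits replace a preloaded `(model, tokenizer)` pair with a model path string, letting `AutoRound` handle loading itself; the accuracy floor is also relaxed from 0.55 to 0.33 for the 0.6B model. A condensed sketch of the new pattern (model name as a placeholder):

```python
from auto_round import AutoRound

# Passing a name/path string lets AutoRound load model and tokenizer itself,
# replacing the old preloaded (model, tokenizer) pair.
ar = AutoRound(model="Qwen/Qwen3-0.6B-FP8", iters=0)  # iters=0: RTN, no tuning
_, folder = ar.quantize_and_save(output_dir="./saved")
# The tests then evaluate the saved checkpoint on lambada_openai and
# assert accuracy stays above the relaxed 0.33 floor.
```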
8 changes: 4 additions & 4 deletions test/test_cuda/test_gguf.py
@@ -56,16 +56,16 @@ def test_gguf_format(self, tiny_qwen_model_path, dataloader):

save_dir = os.path.join(os.path.dirname(__file__), "saved")
res = os.system(
f"cd .. && {sys.executable} -m auto_round --model {tiny_qwen_model_path} --iter 2 "
f"PYTHONPATH='../..:$PYTHONPATH' {sys.executable} -m auto_round --model {tiny_qwen_model_path} --iter 2 "
f"--output_dir {save_dir} --nsample 2 --format gguf:q4_0 --device 0"
)
print(save_dir)
assert not (res > 0 or res == -1), "qwen2 tuning fail"

from llama_cpp import Llama

gguf_file = os.listdir(f"{save_dir}/tmp_tiny_qwen_model_path-gguf")[0]
llm = Llama(f"{save_dir}/tmp_tiny_qwen_model_path-gguf/{gguf_file}", n_gpu_layers=-1)
gguf_file = os.listdir(f"{save_dir}/tiny_qwen_model_path-gguf")[0]
llm = Llama(f"{save_dir}/tiny_qwen_model_path-gguf/{gguf_file}", n_gpu_layers=-1)
output = llm("There is a girl who likes adventure,", max_tokens=32)
print(output)
shutil.rmtree(save_dir, ignore_errors=True)
@@ -155,7 +155,7 @@ def test_vlm_gguf(self):
autoround.quantize_and_save(output_dir=quantized_model_path, format="gguf:q4_0")
assert "mmproj-model.gguf" in os.listdir("./saved")
file_size = os.path.getsize("./saved/Qwen2-VL-2B-Instruct-Q4_0.gguf") / 1024**2
assert abs(file_size - 4242) < 5.0
assert abs(file_size - 894) < 5.0
file_size = os.path.getsize("./saved/mmproj-model.gguf") / 1024**2
assert abs(file_size - 2580) < 5.0
shutil.rmtree("./saved", ignore_errors=True)
3 changes: 2 additions & 1 deletion test/test_cuda/test_marlin_backend.py
@@ -7,11 +7,12 @@
from auto_round import AutoRound, AutoRoundConfig
from auto_round.eval.evaluation import simple_evaluate_user_model

from ..helpers import model_infer
from ..helpers import get_model_path, model_infer


class TestAutoRoundMarlinBackend:
save_dir = "./saved"
model_name = get_model_path("facebook/opt-125m")

@pytest.fixture(autouse=True, scope="class")
def setup_and_teardown_class(self):
1 change: 0 additions & 1 deletion test/test_cuda/test_mix_bits.py
@@ -242,7 +242,6 @@ def test_mixed_autoround_format_vllm(self, tiny_opt_model_path, dataloader):
}
autoround = AutoRound(
tiny_opt_model_path,
self.tokenizer,
scheme="W4A16",
iters=2,
seqlen=2,
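Dropping the positional `self.tokenizer` follows the same API direction as the FP8 tests above: when `AutoRound` receives a model path, it presumably loads the tokenizer itself. A before/after sketch (`tiny_opt_model_path` is the fixture used in this test):

```python
# Before: tokenizer passed explicitly alongside the model path.
# autoround = AutoRound(tiny_opt_model_path, tokenizer, scheme="W4A16", iters=2, seqlen=2)

# After: the tokenizer argument is dropped; AutoRound loads it from the path.
autoround = AutoRound(tiny_opt_model_path, scheme="W4A16", iters=2, seqlen=2)
```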
2 changes: 1 addition & 1 deletion test/test_cuda/test_multiple_card.py
@@ -354,7 +354,7 @@ def test_device_map_for_triton(self):

@multi_card
def test_mllm_device_map(self):
model_name = get_model_path("qwen/Qwen2-VL-2B-Instruct/")
model_name = get_model_path("qwen/Qwen2-VL-2B-Instruct")
from auto_round import AutoRoundMLLM

device_map = "0,1"
4 changes: 2 additions & 2 deletions test/test_cuda/test_multiple_card_calib.py
@@ -41,7 +41,7 @@ def test_multiple_card_calib(self):

##test llm script
res = os.system(
f"cd .. && {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --iters 1 --nsamples 1 --output_dir None"
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --model /models/Meta-Llama-3.1-8B-Instruct --devices '0,1' --quant_lm_head --iters 1 --nsamples 1 --output_dir None"
)
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"
@@ -52,7 +52,7 @@ def test_multiple_card_nvfp4(self):

##test llm script
res = os.system(
f"cd .. && {python_path} -m auto_round --model facebook/opt-125m --scheme NVFP4 --devices '0,1' --iters 1 --nsamples 1 --enable_torch_compile --low_gpu_mem_usage"
f"PYTHONPATH='../..:$PYTHONPATH' {python_path} -m auto_round --model facebook/opt-125m --scheme NVFP4 --devices '0,1' --iters 1 --nsamples 1 --enable_torch_compile --low_gpu_mem_usage"
)
if res > 0 or res == -1:
assert False, "cmd line test fail, please have a check"
2 changes: 1 addition & 1 deletion test/test_cuda/test_scheme.py
@@ -42,7 +42,7 @@ def test_w2a16(self, tiny_opt_model_path):
ar.quantize()

def test_mxfp4(self, tiny_opt_model_path):
ar = AutoRound(tiny_opt_model_path, scheme="MXFP4", nsamples=1, iters=1)
ar = AutoRound(tiny_opt_model_path, scheme="MXFP8_RCEIL", nsamples=1, iters=1)
assert ar.bits == 4
assert ar.act_bits == 4
assert ar.data_type == "mx_fp"
11 changes: 6 additions & 5 deletions test/test_cuda/test_support_vlms.py
@@ -11,6 +11,7 @@


class TestSupportVLMS:

@classmethod
def setup_class(self):
self.save_dir = os.path.join(os.path.dirname(__file__), "ut_saved")
@@ -26,7 +27,7 @@ def test_qwen2(self):
model_path = "/models/Qwen2-VL-2B-Instruct/"
# test tune
res = os.system(
f"cd .. && {self.python_path} -m auto_round --mllm "
f"PYTHONPATH='../..:$PYTHONPATH' {self.python_path} -m auto_round --mllm "
f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}"
)
assert not (res > 0 or res == -1), "qwen2 tuning fail"
@@ -81,7 +82,7 @@ def test_phi3(self):
model_path = "/models/Phi-3.5-vision-instruct/"
## test tune
res = os.system(
f"cd .. && {self.python_path} -m auto_round --mllm "
f"PYTHONPATH='../..:$PYTHONPATH' {self.python_path} -m auto_round --mllm "
f"--model {model_path} --iter 2 --output_dir {self.save_dir} --device {self.device}"
)
assert not (res > 0 or res == -1), "Phi-3.5 tuning fail"
@@ -129,7 +130,7 @@ def test_phi3_vision_awq(self):
model_path = "/models/Phi-3.5-vision-instruct/"
## test tune
res = os.system(
f"cd .. && {self.python_path} -m auto_round --mllm "
f"PYTHONPATH='../..:$PYTHONPATH' {self.python_path} -m auto_round --mllm "
f"--model {model_path} --iter 2 --quant_nontext_module "
f"--nsample 64 --seqlen 32 "
f"--format auto_awq --output_dir {self.save_dir} --device {self.device}"
@@ -177,7 +178,7 @@ def test_glm(self):
model_path = "/models/glm-4v-9b/"
## test tune
res = os.system(
f"cd .. && {self.python_path} -m auto_round "
f"PYTHONPATH='../..:$PYTHONPATH' {self.python_path} -m auto_round "
f"--model {model_path} --iter 1 --output_dir {self.save_dir} --device {self.device}"
)
assert not (res > 0 or res == -1), "glm-4v-9b tuning fail"
@@ -186,7 +187,7 @@ def test_granite_vision(self):
model_path = "/models/granite-vision-3.2-2b"
## test tune
res = os.system(
f"cd .. && {self.python_path} -m auto_round "
f"PYTHONPATH='../..:$PYTHONPATH' {self.python_path} -m auto_round "
f"--model {model_path} --iter 1 --output_dir {self.save_dir} --device {self.device}"
)
assert not (res > 0 or res == -1), "granite-vision-3.2-2b tuning fail"