README.md
@@ -32,6 +32,7 @@ hf download meta-llama/Llama-4-Scout-17B-16E-Instruct --local-dir Llama-4-Scout-
 CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=llama4_mxfp4 --input_model=Llama-4-Scout-17B-16E-Instruct/
 ```
 
+> Note: You can also enable static quantization of the KV cache by passing the `--static_kv_dtype fp8` argument to `main.py`, or the `--static_kv_dtype=fp8` argument to `run_quant.sh` and `run_benchmark.sh`.
 
 ## 2. Benchmark
 
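As a concrete illustration, a minimal sketch of the full quantization command with the FP8 KV cache option enabled, assuming the model directory from the download step above:

```bash
# Quantize to MXFP4 and statically quantize the KV cache to FP8;
# run_quant.sh forwards --static_kv_dtype=fp8 to main.py as "--static_kv_dtype fp8".
CUDA_VISIBLE_DEVICES=0 bash run_quant.sh \
    --topology=llama4_mxfp4 \
    --input_model=Llama-4-Scout-17B-16E-Instruct/ \
    --static_kv_dtype=fp8
```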

main.py
@@ -25,36 +25,69 @@
 )
 
 
-class BasicArgumentParser(argparse.ArgumentParser):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.add_argument("--model", "--model_name", "--model_name_or_path",
-                          help="model name or path")
-
-        self.add_argument('--scheme', default="MXFP4", type=str,
-                          help="quantizaion scheme.")
-
-        self.add_argument("--device", "--devices", default="auto", type=str,
-                          help="the device to be used for tuning. The default is set to auto,"
-                          "allowing for automatic detection."
-                          "Currently, device settings support CPU, GPU, and HPU.")
-
-        self.add_argument("--export_format", default="llm_compressor", type=str,
-                          help="the format to save the model")
-
-        self.add_argument("--output_dir", default="./tmp_autoround", type=str,
-                          help="the directory to save quantized model")
-
-        self.add_argument("--fp_layers", default="", type=str,
-                          help="layers to maintain original data type")
-
-
-def setup_parser():
-    parser = BasicArgumentParser()
-
-    parser.add_argument("--iters", "--iter", default=0, type=int,
-                        help=" iters")
+def setup_parser():
+    parser = argparse.ArgumentParser(
+        description="Llama4 quantization.", formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument(
+        "--model",
+        "--model_name",
+        "--model_name_or_path",
+        help="model name or path"
+    )
+    parser.add_argument(
+        "--scheme",
+        default="MXFP4",
+        type=str,
+        help="quantization scheme."
+    )
+    parser.add_argument(
+        "--device",
+        "--devices",
+        default="auto",
+        type=str,
+        help="the device to be used for tuning. The default is set to auto, "
+        "allowing for automatic detection. "
+        "Currently, device settings support CPU, GPU, and HPU."
+    )
+    parser.add_argument(
+        "--export_format",
+        default="llm_compressor",
+        type=str,
+        help="the format to save the model"
+    )
+    parser.add_argument(
+        "--output_dir",
+        default="./tmp_autoround",
+        type=str,
+        help="the directory to save quantized model"
+    )
+    parser.add_argument(
+        "--fp_layers",
+        default="",
+        type=str,
+        help="layers to maintain original data type"
+    )
+    parser.add_argument(
+        "--static_kv_dtype",
+        default=None,
+        type=str,
+        choices=["fp8", "float8_e4m3fn"],
+        help="data type for static quantization of the key and value cache."
+    )
+
+    parser.add_argument(
+        "--iters",
+        "--iter",
+        default=0,
+        type=int,
+        help="number of tuning iterations"
+    )
 
     args = parser.parse_args()
     return args
@@ -88,6 +121,8 @@ def tune(args):
         export_format=args.export_format,
         output_dir=args.output_dir,
         processor=processor,
+        static_kv_dtype=args.static_kv_dtype,
+        reloading=False,
     )
     model = prepare(model, qconfig)
     model = convert(model, qconfig)
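Taken together, the two `main.py` hunks wire the new flag from the CLI into the quantization config. A minimal sketch of a direct `main.py` run with the static FP8 KV cache option, borrowing the other flag values from the `llama4_mxfp4` branch of `run_quant.sh` shown further below:

```bash
# Direct invocation of main.py with static FP8 KV-cache quantization.
# The --fp_layers/--scheme/--export_format values mirror what run_quant.sh
# assembles for the llama4_mxfp4 topology; --iters 0 is the script default.
python3 main.py \
    --model Llama-4-Scout-17B-16E-Instruct/ \
    --scheme MXFP4 \
    --export_format auto_round \
    --fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert \
    --iters 0 \
    --output_dir ./tmp_autoround \
    --static_kv_dtype fp8
```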

run_benchmark.sh
@@ -27,6 +27,9 @@ function init_params {
       --batch_size=*)
           batch_size=$(echo $var |cut -f2 -d=)
       ;;
+      --static_kv_dtype=*)
+          kv_cache_dtype=$(echo $var |cut -f2 -d=)
+      ;;
     esac
   done
 
@@ -37,6 +40,7 @@ function run_benchmark {
 
   extra_model_args=""
   extra_cmd=""
+  kv_cache_dtype=${kv_cache_dtype:="auto"}
   batch_size=${batch_size:=1}
 
   if [ "${topology}" = "llama4_mxfp4" ]; then
@@ -46,7 +50,7 @@
     export VLLM_ENABLE_STATIC_MOE=0
     export VLLM_USE_DEEP_GEMM=0
     export VLLM_ENABLE_AR_EXT=1
-    extra_model_args="max_model_len=8192,max_num_seqs=1024,max_gen_toks=2048,kv_cache_dtype=auto,gpu_memory_utilization=0.7"
+    extra_model_args="max_model_len=8192,max_num_seqs=1024,max_gen_toks=2048,gpu_memory_utilization=0.7"
     extra_cmd="--gen_kwargs max_gen_toks=2048"
   fi
 
@@ -57,9 +61,15 @@
     model="vllm"
   fi
 
+  if [[ "${kv_cache_dtype}" == "fp8" ]]; then
+    export VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION=0
+    export VLLM_ATTENTION_BACKEND="FLASHINFER"
+    echo "Using FP8 for KV cache"
+  fi
+
   NCCL_NVLS_ENABLE=0 VLLM_USE_STANDALONE_COMPILE=0 VLLM_WORKER_MULTIPROC_METHOD=spawn \
   lm_eval --model ${model} \
-    --model_args pretrained=${input_model},tensor_parallel_size=${tp_size},${extra_model_args},enable_expert_parallel=True \
+    --model_args pretrained=${input_model},tensor_parallel_size=${tp_size},${extra_model_args},enable_expert_parallel=True,kv_cache_dtype=${kv_cache_dtype} \
     --tasks ${tasks} \
     --batch_size ${batch_size} \
     ${extra_cmd}
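With these changes, passing `--static_kv_dtype=fp8` to the benchmark script selects the FlashInfer attention backend and appends `kv_cache_dtype=fp8` to the lm_eval model args. A minimal usage sketch, assuming `run_benchmark.sh` also accepts the `--topology` and `--input_model` flags handled in the elided part of `init_params`, and that the quantized model was saved to `saved_results` (the `run_quant.sh` default):

```bash
# Evaluate the quantized checkpoint with the FP8 KV-cache path enabled.
bash run_benchmark.sh \
    --topology=llama4_mxfp4 \
    --input_model=saved_results/ \
    --static_kv_dtype=fp8
```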

run_quant.sh
@@ -26,8 +26,11 @@ function init_params {
           input_model=$(echo $var |cut -f2 -d=)
       ;;
       --output_model=*)
-           tuned_checkpoint=$(echo $var |cut -f2 -d=)
-       ;;
+          tuned_checkpoint=$(echo $var |cut -f2 -d=)
+      ;;
+      --static_kv_dtype=*)
+          kv_cache_dtype=$(echo $var |cut -f2 -d=)
+      ;;
       *)
           echo "Error: No such parameter: ${var}"
           exit 1
@@ -42,16 +45,21 @@ function run_tuning {
   extra_cmd=""
   tuned_checkpoint=${tuned_checkpoint:="saved_results"}
   iters=${iters:=0}
+  kv_cache_dtype=${kv_cache_dtype:="auto"}
 
   if [ "${topology}" = "llama4_mxfp4" ]; then
     extra_cmd="--fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert --scheme MXFP4 --export_format auto_round"
   fi
 
+  if [[ ! "${kv_cache_dtype}" = "auto" ]]; then
+    extra_cmd=${extra_cmd}" --static_kv_dtype ${kv_cache_dtype}"
+  fi
+
   python3 main.py \
-            --model ${input_model} \
-            --iters ${iters} \
-            --output_dir ${tuned_checkpoint} \
-            ${extra_cmd}
+    --model ${input_model} \
+    --iters ${iters} \
+    --output_dir ${tuned_checkpoint} \
+    ${extra_cmd}
 }
 
 main "$@"