diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
index 74ff5e0820a..2faac68dabe 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
@@ -78,6 +78,8 @@ Notes:
 
 Here we provide several recipes for Llama3 models. The relative accuracy loss of quantized model should be less than 1%.
 
+> Note: You can also enable static quantization of the KV cache by passing the `--static_kv_dtype fp8` argument to `quantize.py`, or `--static_kv_dtype=fp8` to `run_quant.sh` and `run_benchmark.sh`.
+
 #### Llama 3.1 8B MXFP8
 
 RTN (Round-to-Nearest) is enough to keep accuracy.
@@ -110,7 +112,7 @@ CUDA_VISIBLE_DEVICES=0 python quantize.py \
     --low_gpu_mem_usage \
     --export_format auto_round \
     --export_path llama3.1-8B-MXFP4-MXFP8 \
-    --tasks mmlu piqa hellaswag gsm8k \
+    --tasks mmlu_llama piqa hellaswag gsm8k_llama \
     --eval_batch_size 32
 ```
 
@@ -208,8 +210,7 @@ CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.1-70B-MXFP8
 
 The script automatically:
 - Detects available GPUs from `CUDA_VISIBLE_DEVICES` and sets `tensor_parallel_size` accordingly
-- Handles different `add_bos_token` settings for different tasks (GSM8K requires `False`, others use `True`)
-- Runs default tasks: `piqa,hellaswag,mmlu,gsm8k` with batch size 8
+- Runs default tasks: `piqa,hellaswag,mmlu_llama,gsm8k_llama` with batch size 8
 - Supports custom task selection and batch size adjustment
 
 
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
index f51fb19a8c6..feff13f9d20 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
@@ -162,6 +162,13 @@ def get_accuracy(model_name_or_path, tokenizer=None, eval_tasks="mmlu", limit=No
         default=[],
         help="[mix-precision] ensure that listed layers are using same data type for quantization"
     )
+    parser.add_argument(
+        "--static_kv_dtype",
+        default=None,
+        type=str,
+        choices=["fp8", "float8_e4m3fn"],
+        help="Data type used for static quantization of the key and value cache.",
+    )
     parser.add_argument("--use_recipe", action="store_true", help="whether to use recipe to quantize model")
     parser.add_argument("--recipe_file", type=str, default="recipes/Meta-Llama-3.1-8B-Instruct_6bits.json", help="path of recipe file")
     parser.add_argument("--iters", default=200, type=int, help="iters for autoround.")
@@ -248,6 +255,7 @@ def load_recipe_results(file_path):
         target_bits=args.target_bits,
         options=args.options,
         shared_layers=args.shared_layers,
+        static_kv_dtype=args.static_kv_dtype,
         enable_torch_compile=args.enable_torch_compile,
         low_gpu_mem_usage=args.low_gpu_mem_usage,
         export_format=args.export_format,
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh
index 6a07fbd9991..6aaa3a4da56 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh
@@ -6,6 +6,7 @@
 TASKS="piqa,hellaswag,mmlu_llama,gsm8k_llama"
 BATCH_SIZE=64
 GPU_MEMORY_UTILIZATION=0.8
+KV_CACHE_DTYPE="auto"
 
 while [[ $# -gt 0 ]]; do
     case $1 in
@@ -25,6 +26,10 @@ while [[ $# -gt 0 ]]; do
             GPU_MEMORY_UTILIZATION="${1#*=}"
             shift
             ;;
+        --static_kv_dtype=*)
+            KV_CACHE_DTYPE="${1#*=}"
+            shift
+            ;;
         *)
             echo "Unknown parameter: $1"
             exit 1
@@ -32,6 +37,13 @@ while [[ $# -gt 0 ]]; do
     esac
 done
 
+# For FP8 KV cache, evaluate with the FlashInfer attention backend
+if [[ "$KV_CACHE_DTYPE" == "fp8" ]]; then
+    export VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION=1
+    export VLLM_ATTENTION_BACKEND="FLASHINFER"
+    echo "Using FP8 for KV cache"
+fi
+
 # Validate required parameters
 if [[ -z "$MODEL_PATH" ]]; then
     echo "Usage: bash run_benchmark.sh --model_path= [--tasks=] [--batch_size=]"
@@ -75,11 +87,11 @@ run_evaluation() {
     echo "Running evaluation for tasks: $tasks (add_bos_token=$add_bos_token)"
 
     # Print the command being executed
-    local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 --tasks $tasks --batch_size $BATCH_SIZE $extra_args"
+    local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192,kv_cache_dtype=${KV_CACHE_DTYPE} --tasks $tasks --batch_size $BATCH_SIZE $extra_args"
     echo "Executing command: $cmd"
 
     lm_eval --model vllm \
-        --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 \
+        --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192,kv_cache_dtype=${KV_CACHE_DTYPE} \
         --tasks $tasks \
         --batch_size $BATCH_SIZE \
         $extra_args
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
index 14d004b8e8a..a53443831f0 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
@@ -3,6 +3,7 @@
 # Usage: CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-8B --dtype=mxfp8 --input_model=/models/Meta-Llama-3.1-8B-Instruct --output_model=Llama-3.1-8B-MXFP8
 
 # Parse command line arguments
+KV_CACHE_DTYPE="auto"
 while [[ $# -gt 0 ]]; do
     case $1 in
         --topology=*)
@@ -21,6 +22,10 @@ while [[ $# -gt 0 ]]; do
             OUTPUT_MODEL="${1#*=}"
             shift
             ;;
+        --static_kv_dtype=*)
+            KV_CACHE_DTYPE="${1#*=}"
+            shift
+            ;;
         *)
             echo "Unknown parameter: $1"
             exit 1
@@ -43,7 +48,11 @@ echo "  Input Model: $INPUT_MODEL"
 echo "  Output Model: $OUTPUT_MODEL"
 
 # Set common parameters
-COMMON_ARGS="--quantize --enable_torch_compile --low_gpu_mem_usage --export_format auto_round"
+if [ "$KV_CACHE_DTYPE" = "auto" ]; then
+    COMMON_ARGS="--quantize --enable_torch_compile --low_gpu_mem_usage --export_format auto_round"
+else
+    COMMON_ARGS="--quantize --enable_torch_compile --low_gpu_mem_usage --export_format auto_round --static_kv_dtype $KV_CACHE_DTYPE"
+fi
 
 case "$TOPOLOGY" in
     "Llama-3.1-8B")
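For reference, a minimal end-to-end sketch of how the new flag could be exercised, assembled from the usage strings already present in this patch; the topology, dtype, and model paths are illustrative values, not required settings.

```bash
# Quantize with a statically quantized FP8 KV cache via the new --static_kv_dtype flag
# (illustrative paths/names; other supported topologies follow the same pattern).
CUDA_VISIBLE_DEVICES=0 bash run_quant.sh \
    --topology=Llama-3.1-8B \
    --dtype=mxfp8 \
    --input_model=/models/Meta-Llama-3.1-8B-Instruct \
    --output_model=Llama-3.1-8B-MXFP8 \
    --static_kv_dtype=fp8

# Evaluate the exported model; fp8 is forwarded to vLLM as kv_cache_dtype and
# switches evaluation to the FlashInfer attention backend (see run_benchmark.sh).
CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh \
    --model_path=Llama-3.1-8B-MXFP8 \
    --static_kv_dtype=fp8
```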