diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
index 74ff5e0820a..2faac68dabe 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/README.md
@@ -78,6 +78,8 @@ Notes:
 
 Here we provide several recipes for Llama3 models. The relative accuracy loss of quantized model should be less than 1%.
 
+> Note: You can also enable static quantization of the KV cache by passing the `--static_kv_dtype fp8` argument to `quantize.py`, or `--static_kv_dtype=fp8` to `run_quant.sh` and `run_benchmark.sh`.
+
 #### Llama 3.1 8B MXFP8
 
 RTN (Round-to-Nearest) is enough to keep accuracy.
@@ -110,7 +112,7 @@ CUDA_VISIBLE_DEVICES=0 python quantize.py \
     --low_gpu_mem_usage \
     --export_format auto_round \
     --export_path llama3.1-8B-MXFP4-MXFP8 \
-    --tasks mmlu piqa hellaswag gsm8k \
+    --tasks mmlu_llama piqa hellaswag gsm8k_llama \
     --eval_batch_size 32
 ```
 
@@ -208,8 +210,7 @@ CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh --model_path=Llama-3.1-70B-MXFP8
 
 The script automatically:
 - Detects available GPUs from `CUDA_VISIBLE_DEVICES` and sets `tensor_parallel_size` accordingly
-- Handles different `add_bos_token` settings for different tasks (GSM8K requires `False`, others use `True`)
-- Runs default tasks: `piqa,hellaswag,mmlu,gsm8k` with batch size 8
+- Runs default tasks: `piqa,hellaswag,mmlu_llama,gsm8k_llama` with batch size 8
 - Supports custom task selection and batch size adjustment
 
 
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
index f51fb19a8c6..feff13f9d20 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/quantize.py
@@ -162,6 +162,13 @@ def get_accuracy(model_name_or_path, tokenizer=None, eval_tasks="mmlu", limit=No
         default=[],
         help="[mix-precision] ensure that listed layers are using same data type for quantization"
     )
+    parser.add_argument(
+        "--static_kv_dtype",
+        default=None,
+        type=str,
+        choices=["fp8", "float8_e4m3fn"],
+        help="Data type used for static quantization of the key and value cache.",
+    )
     parser.add_argument("--use_recipe", action="store_true", help="whether to use recipe to quantize model")
     parser.add_argument("--recipe_file", type=str, default="recipes/Meta-Llama-3.1-8B-Instruct_6bits.json", help="path of recipe file")
     parser.add_argument("--iters", default=200, type=int, help="iters for autoround.")
@@ -248,6 +255,7 @@ def load_recipe_results(file_path):
         target_bits=args.target_bits,
         options=args.options,
         shared_layers=args.shared_layers,
+        static_kv_dtype=args.static_kv_dtype,
         enable_torch_compile=args.enable_torch_compile,
         low_gpu_mem_usage=args.low_gpu_mem_usage,
         export_format=args.export_format,
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh
index 6a07fbd9991..6aaa3a4da56 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_benchmark.sh
@@ -6,6 +6,7 @@
 TASKS="piqa,hellaswag,mmlu_llama,gsm8k_llama"
 BATCH_SIZE=64
 GPU_MEMORY_UTILIZATION=0.8
+KV_CACHE_DTYPE="auto"
 
 while [[ $# -gt 0 ]]; do
     case $1 in
@@ -25,6 +26,10 @@ while [[ $# -gt 0 ]]; do
             GPU_MEMORY_UTILIZATION="${1#*=}"
             shift
             ;;
+        --static_kv_dtype=*)
+            KV_CACHE_DTYPE="${1#*=}"
+            shift
+            ;;
         *)
             echo "Unknown parameter: $1"
             exit 1
@@ -32,6 +37,13 @@ while [[ $# -gt 0 ]]; do
     esac
 done
 
+# For FP8 KV cache, evaluate with the FlashInfer attention backend
+if [[ "$KV_CACHE_DTYPE" == "fp8" ]]; then
+    export VLLM_FLASHINFER_DISABLE_Q_QUANTIZATION=1
+    export VLLM_ATTENTION_BACKEND="FLASHINFER"
+    echo "Using FP8 for KV cache"
+fi
+
 # Validate required parameters
 if [[ -z "$MODEL_PATH" ]]; then
     echo "Usage: bash run_benchmark.sh --model_path= [--tasks=] [--batch_size=]"
@@ -75,11 +87,11 @@ run_evaluation() {
     echo "Running evaluation for tasks: $tasks (add_bos_token=$add_bos_token)"
 
     # Print the command being executed
-    local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 --tasks $tasks --batch_size $BATCH_SIZE $extra_args"
+    local cmd="lm_eval --model vllm --model_args pretrained=\"$MODEL_PATH\",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192,kv_cache_dtype=${KV_CACHE_DTYPE} --tasks $tasks --batch_size $BATCH_SIZE $extra_args"
     echo "Executing command: $cmd"
 
     lm_eval --model vllm \
-        --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192 \
+        --model_args pretrained="$MODEL_PATH",add_bos_token=$add_bos_token,tensor_parallel_size=$TENSOR_PARALLEL_SIZE,gpu_memory_utilization=$GPU_MEMORY_UTILIZATION,data_parallel_size=1,max_model_len=8192,kv_cache_dtype=${KV_CACHE_DTYPE} \
         --tasks $tasks \
         --batch_size $BATCH_SIZE \
         $extra_args
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
index 14d004b8e8a..a53443831f0 100644
--- a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/auto_round/llama3/run_quant.sh
@@ -3,6 +3,7 @@
 # Usage: CUDA_VISIBLE_DEVICES=0 bash run_quant.sh --topology=Llama-3.1-8B --dtype=mxfp8 --input_model=/models/Meta-Llama-3.1-8B-Instruct --output_model=Llama-3.1-8B-MXFP8
 
 # Parse command line arguments
+KV_CACHE_DTYPE="auto"
 while [[ $# -gt 0 ]]; do
     case $1 in
         --topology=*)
@@ -21,6 +22,10 @@ while [[ $# -gt 0 ]]; do
             OUTPUT_MODEL="${1#*=}"
             shift
             ;;
+        --static_kv_dtype=*)
+            KV_CACHE_DTYPE="${1#*=}"
+            shift
+            ;;
         *)
             echo "Unknown parameter: $1"
             exit 1
@@ -43,7 +48,11 @@ echo "  Input Model: $INPUT_MODEL"
 echo "  Output Model: $OUTPUT_MODEL"
 
 # Set common parameters
-COMMON_ARGS="--quantize --enable_torch_compile --low_gpu_mem_usage --export_format auto_round"
+if [ "$KV_CACHE_DTYPE" = "auto" ]; then
+    COMMON_ARGS="--quantize --enable_torch_compile --low_gpu_mem_usage --export_format auto_round"
+else
+    COMMON_ARGS="--quantize --enable_torch_compile --low_gpu_mem_usage --export_format auto_round --static_kv_dtype $KV_CACHE_DTYPE"
+fi
 
 case "$TOPOLOGY" in
     "Llama-3.1-8B")
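For reference, a minimal end-to-end sketch of how the new flag could be exercised, assembled from the usage strings already present in this patch; the topology, dtype, and model paths are illustrative values, not required settings.

```bash
# Quantize with a statically quantized FP8 KV cache via the new --static_kv_dtype flag
# (illustrative paths/names; other supported topologies follow the same pattern).
CUDA_VISIBLE_DEVICES=0 bash run_quant.sh \
    --topology=Llama-3.1-8B \
    --dtype=mxfp8 \
    --input_model=/models/Meta-Llama-3.1-8B-Instruct \
    --output_model=Llama-3.1-8B-MXFP8 \
    --static_kv_dtype=fp8

# Evaluate the exported model; fp8 is forwarded to vLLM as kv_cache_dtype and
# switches evaluation to the FlashInfer attention backend (see run_benchmark.sh).
CUDA_VISIBLE_DEVICES=0,1 bash run_benchmark.sh \
    --model_path=Llama-3.1-8B-MXFP8 \
    --static_kv_dtype=fp8
```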