Skip to content

Commit

Permalink
Xiaoxia/fp v1 (#871)
Browse files Browse the repository at this point in the history
* add FP6-benchmark

* update

* updatefile

* update tpsize

---------

Co-authored-by: Ubuntu <deepspeed@DS-A100-Largedisk.3xui22esprkudm2r30sfjzexrc.bx.internal.cloudapp.net>
  • Loading branch information
xiaoxiawu-microsoft and Ubuntu authored Mar 8, 2024
1 parent 6e9ada6 commit f415ec8
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 1 deletion.
10 changes: 10 additions & 0 deletions benchmarks/inference/mii/run_fp6.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

# Run the MII/FastGen inference benchmark with FP6 quantization enabled
# (--fp6 selects the 'wf6af16' quantization mode in src/server.py).
MODELS=(NousResearch/Llama-2-70b-hf)

# Quote the array expansion and the loop variable so model names are not
# subject to word splitting or glob expansion (shellcheck SC2068/SC2086).
for MODEL in "${MODELS[@]}"; do
    python ./run_benchmark.py --model "${MODEL}" --num_requests 128 --stream --backend fastgen --fp6 --tp_size 1
done
6 changes: 5 additions & 1 deletion benchmarks/inference/mii/src/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,13 +71,17 @@ def start_fastgen_server(args: argparse.Namespace) -> None:
inference_config = RaggedInferenceEngineConfig(
tensor_parallel=tp_config, state_manager=mgr_config
)

if args.fp6:
quantization_mode = 'wf6af16'
else:
quantization_mode = None
mii.serve(
args.model,
deployment_name=args.deployment_name,
tensor_parallel=args.tp_size,
inference_engine_config=inference_config,
replica_num=args.num_replicas,
quantization_mode=quantization_mode
)


Expand Down
3 changes: 3 additions & 0 deletions benchmarks/inference/mii/src/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,9 @@ def parse_args(
parser.add_argument(
"--overwrite_results", action="store_true", help="Overwrite existing results"
)
parser.add_argument(
"--fp6", action="store_true", help="Enable FP6"
)

# Parse arguments
args = parser.parse_args()
Expand Down

0 comments on commit f415ec8

Please sign in to comment.