[Inference] Add validated models for Gaudi (#269)
* add validated models for Gaudi

* nit

* fix

* remove

* add config

* nit

* remove prompt and add gpt2

* check and add all template, remove bloom-560m, add mixtral, change Qwen to version 1.5

* nit

* fix

* fix

* fix

* remove default template

* fix when list length is 1

* fix

* fix target

* change cache dir

* remove Mixtral

* change to 8 cards

* remove Qwen and fix

* revert and add Qwen&Mixtral back

* nit

* add Qwen1.5-7B-Chat

* add Qwen2-7B-Instruct

* remove several models

* add falcon qwen linear all reduce to hpu_predictor

* nit

---------

Signed-off-by: Yizhong Zhang <[email protected]>
Deegue authored Jul 18, 2024
1 parent df24aa2 commit 4a646b0
Showing 18 changed files with 174 additions and 13 deletions.
33 changes: 21 additions & 12 deletions .github/workflows/workflow_inference_gaudi2.yml
@@ -17,7 +17,7 @@ on:
default: '/home/ci/actions-runner/_work/llm-on-ray/llm-on-ray'
model_cache_path:
type: string
default: '/mnt/DP_disk1/huggingface/cache'
default: '/scratch-2/huggingface/cache'

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-inf-gaudi2
@@ -28,16 +28,28 @@ jobs:
name: inference
strategy:
matrix:
model: [ llama-2-7b-chat-hf, llama-2-70b-chat-hf, llama-2-7b-chat-hf-vllm ]
model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, mistral-7b-v0.1, mpt-7b, Qwen2-7B-Instruct, llama-2-7b-chat-hf-vllm ]
isPR:
- ${{inputs.ci_type == 'pr'}}

exclude:
- { isPR: true }

include:
- { model: "bloom-7b1"}
- { model: "CodeLlama-7b-hf"}
- { model: "falcon-7b"}
- { model: "falcon-40b"}
- { model: "gemma-2b"}
- { model: "gpt-j-6b"}
- { model: "gpt2"}
- { model: "llama-2-7b-chat-hf"}
- { model: "llama-2-70b-chat-hf"}
- { model: "meta-llama-3-8b-instruct"}
- { model: "meta-llama-3-70b-instruct"}
- { model: "mpt-7b"}
- { model: "mistral-7b-v0.1"}
- { model: "Qwen2-7B-Instruct"}
- { model: "llama-2-7b-chat-hf-vllm"}

runs-on: gaudi2
@@ -60,12 +72,10 @@ jobs:
id: "target"
run: |
target="inference"
if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf" ]]; then
target="${target}_gaudi2"
elif [[ ${{ matrix.model }} == "llama-2-70b-chat-hf" ]]; then
target="${target}_gaudi2"
elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
target="${target}_vllm_gaudi2"
else
target="${target}_gaudi2"
fi
echo "target is ${target}"
echo "target=$target" >> $GITHUB_OUTPUT
@@ -105,11 +115,8 @@ jobs:
TARGET=${{steps.target.outputs.target}}
CMD=$(cat << EOF
import yaml
if ("${{ matrix.model }}" == "llama-2-7b-chat-hf"):
conf_path = "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml"
elif ("${{ matrix.model }}" == "llama-2-70b-chat-hf"):
conf_path = "llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml"
elif ("${{ matrix.model }}" == "llama-2-7b-chat-hf-vllm"):
conf_path = "llm_on_ray/inference/models/hpu/" + "${{ matrix.model }}" + "-hpu.yaml"
if ("${{ matrix.model }}" == "llama-2-7b-chat-hf-vllm"):
conf_path = "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml"
with open(conf_path, encoding="utf-8") as reader:
result = yaml.load(reader, Loader=yaml.FullLoader)
@@ -124,6 +131,8 @@ jobs:
docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml --keep_serve_terminal"
elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml --keep_serve_terminal"
else
docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/${{ matrix.model }}-hpu.yaml --keep_serve_terminal"
fi
echo Streaming query:
docker exec "${TARGET}" bash -c "python examples/inference/api_server_openai/query_http_requests.py --model_name ${{ matrix.model }} --streaming_response"
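The embedded Python above no longer enumerates each Llama config; it now derives the HPU config path from the matrix model name and only special-cases the vLLM variant. A minimal standalone sketch of that resolution step (model_name stands in for the ${{ matrix.model }} substitution; the part of the script that edits the loaded YAML is truncated in this diff):

import yaml

model_name = "Qwen2-7B-Instruct"  # value substituted from ${{ matrix.model }} in CI

# Generic rule introduced by this commit: every validated model ships a
# <model>-hpu.yaml under llm_on_ray/inference/models/hpu/.
conf_path = "llm_on_ray/inference/models/hpu/" + model_name + "-hpu.yaml"
if model_name == "llama-2-7b-chat-hf-vllm":
    # The vLLM variant keeps its explicitly named config file.
    conf_path = "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml"

with open(conf_path, encoding="utf-8") as reader:
    result = yaml.load(reader, Loader=yaml.FullLoader)
print(conf_path, result.get("name"))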
13 changes: 13 additions & 0 deletions llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml
@@ -0,0 +1,13 @@
port: 8000
name: CodeLlama-7b-hf
route_prefix: /CodeLlama-7b-hf
num_replicas: 1
cpus_per_worker: 8
hpus_per_worker: 1
device: hpu
model_description:
model_id_or_path: codellama/CodeLlama-7b-hf
tokenizer_name_or_path: codellama/CodeLlama-7b-hf
chat_template: "llm_on_ray/inference/models/templates/template_codellama.jinja"
config:
use_auth_token: ''
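Several of the new configs (CodeLlama, gemma-2b, gpt2, mistral-7b-v0.1 and the two Llama-2 chat files) point chat_template at a Jinja file shipped in the repo instead of relying on the tokenizer's built-in template. A rough sketch of how such a template can be exercised with a Hugging Face tokenizer; the message content is made up, and it assumes the template only consumes the standard messages variable:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")

# Load the repo-local Jinja template referenced by chat_template in the YAML above.
with open("llm_on_ray/inference/models/templates/template_codellama.jinja", encoding="utf-8") as f:
    tokenizer.chat_template = f.read()

messages = [{"role": "user", "content": "Write a function that reverses a string."}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)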
12 changes: 12 additions & 0 deletions llm_on_ray/inference/models/hpu/Qwen2-7B-Instruct-hpu.yaml
@@ -0,0 +1,12 @@
port: 8000
name: Qwen2-7B-Instruct
route_prefix: /Qwen2-7B-Instruct
num_replicas: 1
cpus_per_worker: 8
hpus_per_worker: 1
device: hpu
model_description:
model_id_or_path: Qwen/Qwen2-7B-Instruct
tokenizer_name_or_path: Qwen/Qwen2-7B-Instruct
config:
use_auth_token: ''
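Once one of these configs is served (the workflow does it with llm_on_ray-serve --config_file ... --keep_serve_terminal and then hits the OpenAI-compatible API server), a client call might look like the sketch below. The base URL is an assumption for illustration; the CI itself queries through examples/inference/api_server_openai/query_http_requests.py:

from openai import OpenAI

# Assumed endpoint: llm_on_ray-serve exposes an OpenAI-compatible API on the serve port.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

resp = client.chat.completions.create(
    model="Qwen2-7B-Instruct",  # matches the `name` field in the YAML above
    messages=[{"role": "user", "content": "Say hello from Gaudi."}],
)
print(resp.choices[0].message.content)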
12 changes: 12 additions & 0 deletions llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml
@@ -0,0 +1,12 @@
port: 8000
name: bloom-7b1
route_prefix: /bloom-7b1
num_replicas: 1
cpus_per_worker: 8
hpus_per_worker: 1
device: hpu
model_description:
model_id_or_path: bigscience/bloom-7b1
tokenizer_name_or_path: bigscience/bloom-7b1
config:
use_auth_token: ''
14 changes: 14 additions & 0 deletions llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml
@@ -0,0 +1,14 @@
port: 8000
name: falcon-40b
route_prefix: /falcon-40b
num_replicas: 1
cpus_per_worker: 8
hpus_per_worker: 1
deepspeed: true
workers_per_group: 8
device: hpu
model_description:
model_id_or_path: tiiuae/falcon-40b
tokenizer_name_or_path: tiiuae/falcon-40b
config:
use_auth_token: ''
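Among the configs shown here, falcon-40b is the one that enables DeepSpeed sharding, spreading each replica across 8 workers with 1 HPU apiece (the "change to 8 cards" step in the commit history). A quick sanity check of the resource math using the fields shown above:

import yaml

with open("llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml", encoding="utf-8") as f:
    conf = yaml.safe_load(f)

# workers_per_group x hpus_per_worker gives the Gaudi cards used by one replica.
hpus_per_replica = conf["workers_per_group"] * conf["hpus_per_worker"]
print(f"{conf['name']}: deepspeed={conf['deepspeed']}, HPUs per replica={hpus_per_replica}")
# Expected output: falcon-40b: deepspeed=True, HPUs per replica=8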
12 changes: 12 additions & 0 deletions llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml
@@ -0,0 +1,12 @@
port: 8000
name: falcon-7b
route_prefix: /falcon-7b
num_replicas: 1
cpus_per_worker: 8
hpus_per_worker: 1
device: hpu
model_description:
model_id_or_path: tiiuae/falcon-7b
tokenizer_name_or_path: tiiuae/falcon-7b
config:
use_auth_token: ''
13 changes: 13 additions & 0 deletions llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml
@@ -0,0 +1,13 @@
port: 8000
name: gemma-2b
route_prefix: /gemma-2b
num_replicas: 1
cpus_per_worker: 8
hpus_per_worker: 1
device: hpu
model_description:
model_id_or_path: google/gemma-2b
tokenizer_name_or_path: google/gemma-2b
chat_template: "llm_on_ray/inference/models/templates/template_gemma.jinja"
config:
use_auth_token: ' '
13 changes: 13 additions & 0 deletions llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml
@@ -0,0 +1,13 @@
port: 8000
name: gpt-j-6b
route_prefix: /gpt-j-6b
num_replicas: 1
cpus_per_worker: 8
hpus_per_worker: 1
device: hpu
model_description:
model_id_or_path: EleutherAI/gpt-j-6b
tokenizer_name_or_path: EleutherAI/gpt-j-6b
gpt_base_model: true
config:
use_auth_token: ''
14 changes: 14 additions & 0 deletions llm_on_ray/inference/models/hpu/gpt2-hpu.yaml
@@ -0,0 +1,14 @@
port: 8000
name: gpt2
route_prefix: /gpt2
num_replicas: 1
cpus_per_worker: 8
hpus_per_worker: 1
device: hpu
model_description:
model_id_or_path: gpt2
tokenizer_name_or_path: gpt2
chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja"
gpt_base_model: true
config:
use_auth_token: ''
llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml
@@ -10,5 +10,6 @@ device: hpu
model_description:
model_id_or_path: NousResearch/Llama-2-70b-chat-hf
tokenizer_name_or_path: NousResearch/Llama-2-70b-chat-hf
chat_template: "llm_on_ray/inference/models/templates/template_llama2.jinja"
config:
use_auth_token: ''
llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml
@@ -8,5 +8,6 @@ device: hpu
model_description:
model_id_or_path: NousResearch/Llama-2-7b-chat-hf
tokenizer_name_or_path: NousResearch/Llama-2-7b-chat-hf
chat_template: "llm_on_ray/inference/models/templates/template_llama2.jinja"
config:
use_auth_token: ''
13 changes: 13 additions & 0 deletions llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml
@@ -0,0 +1,13 @@
port: 8000
name: mistral-7b-v0.1
route_prefix: /mistral-7b-v0.1
num_replicas: 1
cpus_per_worker: 8
hpus_per_worker: 1
device: hpu
model_description:
model_id_or_path: mistralai/Mistral-7B-v0.1
tokenizer_name_or_path: mistralai/Mistral-7B-v0.1
chat_template: "llm_on_ray/inference/models/templates/template_mistral.jinja"
config:
use_auth_token: ''
13 changes: 13 additions & 0 deletions llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml
@@ -0,0 +1,13 @@
port: 8000
name: mpt-7b
route_prefix: /mpt-7b
num_replicas: 1
cpus_per_worker: 8
hpus_per_worker: 1
device: hpu
model_description:
model_id_or_path: EleutherAI/gpt-neox-20b
tokenizer_name_or_path: EleutherAI/gpt-neox-20b
config:
use_auth_token: ''
trust_remote_code: true
19 changes: 19 additions & 0 deletions llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml
@@ -0,0 +1,19 @@
port: 8000
name: neural-chat-7b-v3-3
route_prefix: /neural-chat-7b-v3-3
num_replicas: 1
cpus_per_worker: 0
gpus_per_worker: 0
hpus_per_worker: 1
deepspeed: false
workers_per_group: 2
device: hpu
ipex:
enabled: false
precision: bf16
model_description:
model_id_or_path: Intel/neural-chat-7b-v3-3
tokenizer_name_or_path: Intel/neural-chat-7b-v3-3
chat_template: "llm_on_ray/inference/models/templates/template_neuralchat.jinja"
config:
use_auth_token: ''
2 changes: 2 additions & 0 deletions llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml
@@ -15,3 +15,5 @@ model_description:
model_id_or_path: Intel/neural-chat-7b-v3-3
tokenizer_name_or_path: Intel/neural-chat-7b-v3-3
chat_template: "llm_on_ray/inference/models/templates/template_neuralchat.jinja"
config:
use_auth_token: ''
2 changes: 1 addition & 1 deletion llm_on_ray/inference/predictors/hpu_predictor.py
@@ -341,7 +341,7 @@ def load_model(self):
engine = deepspeed.init_inference(model, **ds_inference_kwargs)
self.model = engine.module

if self.model.config.model_type == "llama":
if self.model.config.model_type in ["llama", "falcon", "qwen2"]:

def patch_scoped_linear_all_reduce(model):
from deepspeed.module_inject.layers import LinearAllreduce
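The predictor previously applied this patch only to Llama models after deepspeed.init_inference; the change extends it to Falcon and Qwen2 (the "add falcon qwen linear all reduce to hpu_predictor" commit message). The body of patch_scoped_linear_all_reduce is cut off in this diff; below is a sketch of the usual recursion, where the ScopedLinearAllReduce wrapper and its import path are assumptions based on optimum-habana rather than lines copied from this file:

def patch_scoped_linear_all_reduce(model):
    # Recursively wrap every DeepSpeed LinearAllreduce so its all-reduce can be
    # scoped/deferred on HPU (ScopedLinearAllReduce is the assumed optimum-habana helper).
    from deepspeed.module_inject.layers import LinearAllreduce
    from optimum.habana.transformers.models.modeling_all_models import ScopedLinearAllReduce

    for name, module in model.named_children():
        if type(module) is LinearAllreduce:
            setattr(model, name, ScopedLinearAllReduce(mod=module))
        patch_scoped_linear_all_reduce(module)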
