[Inference] Add validated models for Gaudi (#269)
* add validated models for Gaudi

* nit

* fix

* remove

* add config

* nit

* remove prompt and add gpt2

* check and add all template, remove bloom-560m, add mixtral, change Qwen to version 1.5

* nit

* fix

* fix

* fix

* remove default template

* fix when list length is 1

* fix

* fix target

* change cache dir

* remove Mixtral

* change to 8 cards

* remove Qwen and fix

* revert and add Qwen&Mixtral back

* nit

* add Qwen1.5-7B-Chat

* add Qwen2-7B-Instruct

* remove several models

* add falcon qwen linear all reduce to hpu_predictor

* nit

---------

Signed-off-by: Yizhong Zhang <[email protected]>
Deegue authored Jul 18, 2024
1 parent df24aa2 commit 4a646b0
Showing 18 changed files with 174 additions and 13 deletions.
33 changes: 21 additions & 12 deletions .github/workflows/workflow_inference_gaudi2.yml
@@ -17,7 +17,7 @@ on:
default: '/home/ci/actions-runner/_work/llm-on-ray/llm-on-ray'
model_cache_path:
type: string
default: '/mnt/DP_disk1/huggingface/cache'
default: '/scratch-2/huggingface/cache'

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-inf-gaudi2
@@ -28,16 +28,28 @@ jobs:
name: inference
strategy:
matrix:
model: [ llama-2-7b-chat-hf, llama-2-70b-chat-hf, llama-2-7b-chat-hf-vllm ]
model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, mistral-7b-v0.1, mpt-7b, Qwen2-7B-Instruct, llama-2-7b-chat-hf-vllm ]
isPR:
- ${{inputs.ci_type == 'pr'}}

exclude:
- { isPR: true }

include:
- { model: "bloom-7b1"}
- { model: "CodeLlama-7b-hf"}
- { model: "falcon-7b"}
- { model: "falcon-40b"}
- { model: "gemma-2b"}
- { model: "gpt-j-6b"}
- { model: "gpt2"}
- { model: "llama-2-7b-chat-hf"}
- { model: "llama-2-70b-chat-hf"}
- { model: "meta-llama-3-8b-instruct"}
- { model: "meta-llama-3-70b-instruct"}
- { model: "mpt-7b"}
- { model: "mistral-7b-v0.1"}
- { model: "Qwen2-7B-Instruct"}
- { model: "llama-2-7b-chat-hf-vllm"}

runs-on: gaudi2
@@ -60,12 +72,10 @@ jobs:
id: "target"
run: |
target="inference"
if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf" ]]; then
target="${target}_gaudi2"
elif [[ ${{ matrix.model }} == "llama-2-70b-chat-hf" ]]; then
target="${target}_gaudi2"
elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
target="${target}_vllm_gaudi2"
else
target="${target}_gaudi2"
fi
echo "target is ${target}"
echo "target=$target" >> $GITHUB_OUTPUT
@@ -105,11 +115,8 @@ jobs:
TARGET=${{steps.target.outputs.target}}
CMD=$(cat << EOF
import yaml
if ("${{ matrix.model }}" == "llama-2-7b-chat-hf"):
conf_path = "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml"
elif ("${{ matrix.model }}" == "llama-2-70b-chat-hf"):
conf_path = "llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml"
elif ("${{ matrix.model }}" == "llama-2-7b-chat-hf-vllm"):
conf_path = "llm_on_ray/inference/models/hpu/" + "${{ matrix.model }}" + "-hpu.yaml"
if ("${{ matrix.model }}" == "llama-2-7b-chat-hf-vllm"):
conf_path = "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml"
with open(conf_path, encoding="utf-8") as reader:
result = yaml.load(reader, Loader=yaml.FullLoader)
@@ -124,6 +131,8 @@ jobs:
docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml --keep_serve_terminal"
elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml --keep_serve_terminal"
else
docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/${{ matrix.model }}-hpu.yaml --keep_serve_terminal"
fi
echo Streaming query:
docker exec "${TARGET}" bash -c "python examples/inference/api_server_openai/query_http_requests.py --model_name ${{ matrix.model }} --streaming_response"
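The embedded Python above no longer enumerates each Llama config; it now derives the HPU config path from the matrix model name and only special-cases the vLLM variant. A minimal standalone sketch of that resolution step (model_name stands in for the ${{ matrix.model }} substitution; the part of the script that edits the loaded YAML is truncated in this diff):

import yaml

model_name = "Qwen2-7B-Instruct"  # value substituted from ${{ matrix.model }} in CI

# Generic rule introduced by this commit: every validated model ships a
# <model>-hpu.yaml under llm_on_ray/inference/models/hpu/.
conf_path = "llm_on_ray/inference/models/hpu/" + model_name + "-hpu.yaml"
if model_name == "llama-2-7b-chat-hf-vllm":
    # The vLLM variant keeps its explicitly named config file.
    conf_path = "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml"

with open(conf_path, encoding="utf-8") as reader:
    result = yaml.load(reader, Loader=yaml.FullLoader)
print(conf_path, result.get("name"))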
13 changes: 13 additions & 0 deletions llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml
@@ -0,0 +1,13 @@
port: 8000
name: CodeLlama-7b-hf
route_prefix: /CodeLlama-7b-hf
num_replicas: 1
cpus_per_worker: 8
hpus_per_worker: 1
device: hpu
model_description:
model_id_or_path: codellama/CodeLlama-7b-hf
tokenizer_name_or_path: codellama/CodeLlama-7b-hf
chat_template: "llm_on_ray/inference/models/templates/template_codellama.jinja"
config:
use_auth_token: ''
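Several of the new configs (CodeLlama, gemma-2b, gpt2, mistral-7b-v0.1 and the two Llama-2 chat files) point chat_template at a Jinja file shipped in the repo instead of relying on the tokenizer's built-in template. A rough sketch of how such a template can be exercised with a Hugging Face tokenizer; the message content is made up, and it assumes the template only consumes the standard messages variable:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")

# Load the repo-local Jinja template referenced by chat_template in the YAML above.
with open("llm_on_ray/inference/models/templates/template_codellama.jinja", encoding="utf-8") as f:
    tokenizer.chat_template = f.read()

messages = [{"role": "user", "content": "Write a function that reverses a string."}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)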
12 changes: 12 additions & 0 deletions llm_on_ray/inference/models/hpu/Qwen2-7B-Instruct-hpu.yaml
@@ -0,0 +1,12 @@
port: 8000
name: Qwen2-7B-Instruct
route_prefix: /Qwen2-7B-Instruct
num_replicas: 1
cpus_per_worker: 8
hpus_per_worker: 1
device: hpu
model_description:
model_id_or_path: Qwen/Qwen2-7B-Instruct
tokenizer_name_or_path: Qwen/Qwen2-7B-Instruct
config:
use_auth_token: ''
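Once one of these configs is served (the workflow does it with llm_on_ray-serve --config_file ... --keep_serve_terminal and then hits the OpenAI-compatible API server), a client call might look like the sketch below. The base URL is an assumption for illustration; the CI itself queries through examples/inference/api_server_openai/query_http_requests.py:

from openai import OpenAI

# Assumed endpoint: llm_on_ray-serve exposes an OpenAI-compatible API on the serve port.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-needed")

resp = client.chat.completions.create(
    model="Qwen2-7B-Instruct",  # matches the `name` field in the YAML above
    messages=[{"role": "user", "content": "Say hello from Gaudi."}],
)
print(resp.choices[0].message.content)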
12 changes: 12 additions & 0 deletions llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml
@@ -0,0 +1,12 @@
port: 8000
name: bloom-7b1
route_prefix: /bloom-7b1
num_replicas: 1
cpus_per_worker: 8
hpus_per_worker: 1
device: hpu
model_description:
model_id_or_path: bigscience/bloom-7b1
tokenizer_name_or_path: bigscience/bloom-7b1
config:
use_auth_token: ''
14 changes: 14 additions & 0 deletions llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml
@@ -0,0 +1,14 @@
port: 8000
name: falcon-40b
route_prefix: /falcon-40b
num_replicas: 1
cpus_per_worker: 8
hpus_per_worker: 1
deepspeed: true
workers_per_group: 8
device: hpu
model_description:
model_id_or_path: tiiuae/falcon-40b
tokenizer_name_or_path: tiiuae/falcon-40b
config:
use_auth_token: ''
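Among the configs shown here, falcon-40b is the one that enables DeepSpeed sharding, spreading each replica across 8 workers with 1 HPU apiece (the "change to 8 cards" step in the commit history). A quick sanity check of the resource math using the fields shown above:

import yaml

with open("llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml", encoding="utf-8") as f:
    conf = yaml.safe_load(f)

# workers_per_group x hpus_per_worker gives the Gaudi cards used by one replica.
hpus_per_replica = conf["workers_per_group"] * conf["hpus_per_worker"]
print(f"{conf['name']}: deepspeed={conf['deepspeed']}, HPUs per replica={hpus_per_replica}")
# Expected output: falcon-40b: deepspeed=True, HPUs per replica=8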
12 changes: 12 additions & 0 deletions llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml
@@ -0,0 +1,12 @@
port: 8000
name: falcon-7b
route_prefix: /falcon-7b
num_replicas: 1
cpus_per_worker: 8
hpus_per_worker: 1
device: hpu
model_description:
model_id_or_path: tiiuae/falcon-7b
tokenizer_name_or_path: tiiuae/falcon-7b
config:
use_auth_token: ''
13 changes: 13 additions & 0 deletions llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml
@@ -0,0 +1,13 @@
port: 8000
name: gemma-2b
route_prefix: /gemma-2b
num_replicas: 1
cpus_per_worker: 8
hpus_per_worker: 1
device: hpu
model_description:
model_id_or_path: google/gemma-2b
tokenizer_name_or_path: google/gemma-2b
chat_template: "llm_on_ray/inference/models/templates/template_gemma.jinja"
config:
use_auth_token: ' '
13 changes: 13 additions & 0 deletions llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml
@@ -0,0 +1,13 @@
port: 8000
name: gpt-j-6b
route_prefix: /gpt-j-6b
num_replicas: 1
cpus_per_worker: 8
hpus_per_worker: 1
device: hpu
model_description:
model_id_or_path: EleutherAI/gpt-j-6b
tokenizer_name_or_path: EleutherAI/gpt-j-6b
gpt_base_model: true
config:
use_auth_token: ''
14 changes: 14 additions & 0 deletions llm_on_ray/inference/models/hpu/gpt2-hpu.yaml
@@ -0,0 +1,14 @@
port: 8000
name: gpt2
route_prefix: /gpt2
num_replicas: 1
cpus_per_worker: 8
hpus_per_worker: 1
device: hpu
model_description:
model_id_or_path: gpt2
tokenizer_name_or_path: gpt2
chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja"
gpt_base_model: true
config:
use_auth_token: ''
llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml
@@ -10,5 +10,6 @@ device: hpu
model_description:
model_id_or_path: NousResearch/Llama-2-70b-chat-hf
tokenizer_name_or_path: NousResearch/Llama-2-70b-chat-hf
chat_template: "llm_on_ray/inference/models/templates/template_llama2.jinja"
config:
use_auth_token: ''
llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml
@@ -8,5 +8,6 @@ device: hpu
model_description:
model_id_or_path: NousResearch/Llama-2-7b-chat-hf
tokenizer_name_or_path: NousResearch/Llama-2-7b-chat-hf
chat_template: "llm_on_ray/inference/models/templates/template_llama2.jinja"
config:
use_auth_token: ''
13 changes: 13 additions & 0 deletions llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml
@@ -0,0 +1,13 @@
port: 8000
name: mistral-7b-v0.1
route_prefix: /mistral-7b-v0.1
num_replicas: 1
cpus_per_worker: 8
hpus_per_worker: 1
device: hpu
model_description:
model_id_or_path: mistralai/Mistral-7B-v0.1
tokenizer_name_or_path: mistralai/Mistral-7B-v0.1
chat_template: "llm_on_ray/inference/models/templates/template_mistral.jinja"
config:
use_auth_token: ''
13 changes: 13 additions & 0 deletions llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml
@@ -0,0 +1,13 @@
port: 8000
name: mpt-7b
route_prefix: /mpt-7b
num_replicas: 1
cpus_per_worker: 8
hpus_per_worker: 1
device: hpu
model_description:
model_id_or_path: EleutherAI/gpt-neox-20b
tokenizer_name_or_path: EleutherAI/gpt-neox-20b
config:
use_auth_token: ''
trust_remote_code: true
19 changes: 19 additions & 0 deletions llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml
@@ -0,0 +1,19 @@
port: 8000
name: neural-chat-7b-v3-3
route_prefix: /neural-chat-7b-v3-3
num_replicas: 1
cpus_per_worker: 0
gpus_per_worker: 0
hpus_per_worker: 1
deepspeed: false
workers_per_group: 2
device: hpu
ipex:
enabled: false
precision: bf16
model_description:
model_id_or_path: Intel/neural-chat-7b-v3-3
tokenizer_name_or_path: Intel/neural-chat-7b-v3-3
chat_template: "llm_on_ray/inference/models/templates/template_neuralchat.jinja"
config:
use_auth_token: ''
2 changes: 2 additions & 0 deletions llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml
@@ -15,3 +15,5 @@ model_description:
model_id_or_path: Intel/neural-chat-7b-v3-3
tokenizer_name_or_path: Intel/neural-chat-7b-v3-3
chat_template: "llm_on_ray/inference/models/templates/template_neuralchat.jinja"
config:
use_auth_token: ''
2 changes: 1 addition & 1 deletion llm_on_ray/inference/predictors/hpu_predictor.py
@@ -341,7 +341,7 @@ def load_model(self):
engine = deepspeed.init_inference(model, **ds_inference_kwargs)
self.model = engine.module

if self.model.config.model_type == "llama":
if self.model.config.model_type in ["llama", "falcon", "qwen2"]:

def patch_scoped_linear_all_reduce(model):
from deepspeed.module_inject.layers import LinearAllreduce
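The predictor previously applied this patch only to Llama models after deepspeed.init_inference; the change extends it to Falcon and Qwen2 (the "add falcon qwen linear all reduce to hpu_predictor" commit message). The body of patch_scoped_linear_all_reduce is cut off in this diff; below is a sketch of the usual recursion, where the ScopedLinearAllReduce wrapper and its import path are assumptions based on optimum-habana rather than lines copied from this file:

def patch_scoped_linear_all_reduce(model):
    # Recursively wrap every DeepSpeed LinearAllreduce so its all-reduce can be
    # scoped/deferred on HPU (ScopedLinearAllReduce is the assumed optimum-habana helper).
    from deepspeed.module_inject.layers import LinearAllreduce
    from optimum.habana.transformers.models.modeling_all_models import ScopedLinearAllReduce

    for name, module in model.named_children():
        if type(module) is LinearAllreduce:
            setattr(model, name, ScopedLinearAllReduce(mod=module))
        patch_scoped_linear_all_reduce(module)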
