diff --git a/.github/workflows/workflow_inference_gaudi2.yml b/.github/workflows/workflow_inference_gaudi2.yml
index dedeb415..6abd0381 100644
--- a/.github/workflows/workflow_inference_gaudi2.yml
+++ b/.github/workflows/workflow_inference_gaudi2.yml
@@ -17,7 +17,7 @@ on:
         default: '/home/ci/actions-runner/_work/llm-on-ray/llm-on-ray'
       model_cache_path:
         type: string
-        default: '/mnt/DP_disk1/huggingface/cache'
+        default: '/scratch-2/huggingface/cache'
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-inf-gaudi2
@@ -28,7 +28,7 @@ jobs:
     name: inference
     strategy:
       matrix:
-        model: [ llama-2-7b-chat-hf, llama-2-70b-chat-hf, llama-2-7b-chat-hf-vllm ]
+        model: [ bloom-7b1, CodeLlama-7b-hf, falcon-7b, falcon-40b, gemma-2b, gpt-j-6b, gpt2, llama-2-7b-chat-hf, llama-2-70b-chat-hf, meta-llama-3-8b-instruct, meta-llama-3-70b-instruct, mistral-7b-v0.1, mpt-7b, Qwen2-7B-Instruct, llama-2-7b-chat-hf-vllm ]
         isPR:
           - ${{inputs.ci_type == 'pr'}}
 
@@ -36,8 +36,20 @@ jobs:
           - { isPR: true }
 
         include:
+          - { model: "bloom-7b1"}
+          - { model: "CodeLlama-7b-hf"}
+          - { model: "falcon-7b"}
+          - { model: "falcon-40b"}
+          - { model: "gemma-2b"}
+          - { model: "gpt-j-6b"}
+          - { model: "gpt2"}
           - { model: "llama-2-7b-chat-hf"}
           - { model: "llama-2-70b-chat-hf"}
+          - { model: "meta-llama-3-8b-instruct"}
+          - { model: "meta-llama-3-70b-instruct"}
+          - { model: "mpt-7b"}
+          - { model: "mistral-7b-v0.1"}
+          - { model: "Qwen2-7B-Instruct"}
           - { model: "llama-2-7b-chat-hf-vllm"}
 
     runs-on: gaudi2
@@ -60,12 +72,10 @@ jobs:
         id: "target"
         run: |
           target="inference"
-          if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf" ]]; then
-            target="${target}_gaudi2"
-          elif [[ ${{ matrix.model }} == "llama-2-70b-chat-hf" ]]; then
-            target="${target}_gaudi2"
-          elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
+          if [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
             target="${target}_vllm_gaudi2"
+          else
+            target="${target}_gaudi2"
           fi
           echo "target is ${target}"
           echo "target=$target" >> $GITHUB_OUTPUT
@@ -105,11 +115,8 @@ jobs:
           TARGET=${{steps.target.outputs.target}}
           CMD=$(cat << EOF
           import yaml
-          if ("${{ matrix.model }}" == "llama-2-7b-chat-hf"):
-              conf_path = "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml"
-          elif ("${{ matrix.model }}" == "llama-2-70b-chat-hf"):
-              conf_path = "llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml"
-          elif ("${{ matrix.model }}" == "llama-2-7b-chat-hf-vllm"):
+          conf_path = "llm_on_ray/inference/models/hpu/" + "${{ matrix.model }}" + "-hpu.yaml"
+          if ("${{ matrix.model }}" == "llama-2-7b-chat-hf-vllm"):
               conf_path = "llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml"
           with open(conf_path, encoding="utf-8") as reader:
               result = yaml.load(reader, Loader=yaml.FullLoader)
@@ -124,6 +131,8 @@ jobs:
             docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml --keep_serve_terminal"
           elif [[ ${{ matrix.model }} == "llama-2-7b-chat-hf-vllm" ]]; then
             docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-vllm-hpu.yaml --keep_serve_terminal"
+          else
+            docker exec "${TARGET}" bash -c "llm_on_ray-serve --config_file llm_on_ray/inference/models/hpu/${{ matrix.model }}-hpu.yaml --keep_serve_terminal"
           fi
           echo Streaming query:
           docker exec "${TARGET}" bash -c "python examples/inference/api_server_openai/query_http_requests.py --model_name ${{ matrix.model }} --streaming_response"
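Note: the simplified workflow steps above assume every matrix entry maps to a config file named `llm_on_ray/inference/models/hpu/<model>-hpu.yaml`, with `llama-2-7b-chat-hf-vllm` kept as an explicit case. A minimal standalone sketch of that naming convention (the `MODELS` list and the file-existence check are illustrative, not part of the workflow), runnable from the repository root:

```python
from pathlib import Path

# Matrix entries from the workflow above.
MODELS = [
    "bloom-7b1", "CodeLlama-7b-hf", "falcon-7b", "falcon-40b", "gemma-2b",
    "gpt-j-6b", "gpt2", "llama-2-7b-chat-hf", "llama-2-70b-chat-hf",
    "meta-llama-3-8b-instruct", "meta-llama-3-70b-instruct", "mistral-7b-v0.1",
    "mpt-7b", "Qwen2-7B-Instruct", "llama-2-7b-chat-hf-vllm",
]


def conf_path(model: str) -> str:
    # Same rule the workflow's heredoc applies; even the vLLM entry resolves
    # to its own llama-2-7b-chat-hf-vllm-hpu.yaml file under this naming.
    return f"llm_on_ray/inference/models/hpu/{model}-hpu.yaml"


if __name__ == "__main__":
    # Sanity-check that every matrix entry has a matching HPU config file.
    missing = [m for m in MODELS if not Path(conf_path(m)).is_file()]
    print("missing configs:", missing or "none")
```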
diff --git a/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml
new file mode 100644
index 00000000..71762756
--- /dev/null
+++ b/llm_on_ray/inference/models/hpu/CodeLlama-7b-hf-hpu.yaml
@@ -0,0 +1,13 @@
+port: 8000
+name: CodeLlama-7b-hf
+route_prefix: /CodeLlama-7b-hf
+num_replicas: 1
+cpus_per_worker: 8
+hpus_per_worker: 1
+device: hpu
+model_description:
+  model_id_or_path: codellama/CodeLlama-7b-hf
+  tokenizer_name_or_path: codellama/CodeLlama-7b-hf
+  chat_template: "llm_on_ray/inference/models/templates/template_codellama.jinja"
+  config:
+    use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/Qwen2-7B-Instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/Qwen2-7B-Instruct-hpu.yaml
new file mode 100644
index 00000000..09f705f1
--- /dev/null
+++ b/llm_on_ray/inference/models/hpu/Qwen2-7B-Instruct-hpu.yaml
@@ -0,0 +1,12 @@
+port: 8000
+name: Qwen2-7B-Instruct
+route_prefix: /Qwen2-7B-Instruct
+num_replicas: 1
+cpus_per_worker: 8
+hpus_per_worker: 1
+device: hpu
+model_description:
+  model_id_or_path: Qwen/Qwen2-7B-Instruct
+  tokenizer_name_or_path: Qwen/Qwen2-7B-Instruct
+  config:
+    use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml b/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml
new file mode 100644
index 00000000..5de2a3e3
--- /dev/null
+++ b/llm_on_ray/inference/models/hpu/bloom-7b1-hpu.yaml
@@ -0,0 +1,12 @@
+port: 8000
+name: bloom-7b1
+route_prefix: /bloom-7b1
+num_replicas: 1
+cpus_per_worker: 8
+hpus_per_worker: 1
+device: hpu
+model_description:
+  model_id_or_path: bigscience/bloom-7b1
+  tokenizer_name_or_path: bigscience/bloom-7b1
+  config:
+    use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml b/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml
new file mode 100644
index 00000000..dd6bd2ac
--- /dev/null
+++ b/llm_on_ray/inference/models/hpu/falcon-40b-hpu.yaml
@@ -0,0 +1,14 @@
+port: 8000
+name: falcon-40b
+route_prefix: /falcon-40b
+num_replicas: 1
+cpus_per_worker: 8
+hpus_per_worker: 1
+deepspeed: true
+workers_per_group: 8
+device: hpu
+model_description:
+  model_id_or_path: tiiuae/falcon-40b
+  tokenizer_name_or_path: tiiuae/falcon-40b
+  config:
+    use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml
new file mode 100644
index 00000000..e21110c0
--- /dev/null
+++ b/llm_on_ray/inference/models/hpu/falcon-7b-hpu.yaml
@@ -0,0 +1,12 @@
+port: 8000
+name: falcon-7b
+route_prefix: /falcon-7b
+num_replicas: 1
+cpus_per_worker: 8
+hpus_per_worker: 1
+device: hpu
+model_description:
+  model_id_or_path: tiiuae/falcon-7b
+  tokenizer_name_or_path: tiiuae/falcon-7b
+  config:
+    use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml b/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml
new file mode 100644
index 00000000..3b7e6d58
--- /dev/null
+++ b/llm_on_ray/inference/models/hpu/gemma-2b-hpu.yaml
@@ -0,0 +1,13 @@
+port: 8000
+name: gemma-2b
+route_prefix: /gemma-2b
+num_replicas: 1
+cpus_per_worker: 8
+hpus_per_worker: 1
+device: hpu
+model_description:
+  model_id_or_path: google/gemma-2b
+  tokenizer_name_or_path: google/gemma-2b
+  chat_template: "llm_on_ray/inference/models/templates/template_gemma.jinja"
+  config:
+    use_auth_token: ' '
diff --git a/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml
new file mode 100644
index 00000000..8260a644
--- /dev/null
+++ b/llm_on_ray/inference/models/hpu/gpt-j-6b-hpu.yaml
@@ -0,0 +1,13 @@
+port: 8000
+name: gpt-j-6b
+route_prefix: /gpt-j-6b
+num_replicas: 1
+cpus_per_worker: 8
+hpus_per_worker: 1
+device: hpu
+model_description:
+  model_id_or_path: EleutherAI/gpt-j-6b
+  tokenizer_name_or_path: EleutherAI/gpt-j-6b
+  gpt_base_model: true
+  config:
+    use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml b/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml
new file mode 100644
index 00000000..b25903cf
--- /dev/null
+++ b/llm_on_ray/inference/models/hpu/gpt2-hpu.yaml
@@ -0,0 +1,14 @@
+port: 8000
+name: gpt2
+route_prefix: /gpt2
+num_replicas: 1
+cpus_per_worker: 8
+hpus_per_worker: 1
+device: hpu
+model_description:
+  model_id_or_path: gpt2
+  tokenizer_name_or_path: gpt2
+  chat_template: "llm_on_ray/inference/models/templates/template_gpt2.jinja"
+  gpt_base_model: true
+  config:
+    use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml
index 4ecf45cd..27b1d9c2 100644
--- a/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml
+++ b/llm_on_ray/inference/models/hpu/llama-2-70b-chat-hf-hpu.yaml
@@ -10,5 +10,6 @@ device: hpu
 model_description:
   model_id_or_path: NousResearch/Llama-2-70b-chat-hf
   tokenizer_name_or_path: NousResearch/Llama-2-70b-chat-hf
+  chat_template: "llm_on_ray/inference/models/templates/template_llama2.jinja"
   config:
     use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml
index cb57f276..f25b3101 100644
--- a/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml
+++ b/llm_on_ray/inference/models/hpu/llama-2-7b-chat-hf-hpu.yaml
@@ -8,5 +8,6 @@ device: hpu
 model_description:
   model_id_or_path: NousResearch/Llama-2-7b-chat-hf
   tokenizer_name_or_path: NousResearch/Llama-2-7b-chat-hf
+  chat_template: "llm_on_ray/inference/models/templates/template_llama2.jinja"
   config:
     use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/llama-3-70b-chat-hf-hpu.yaml b/llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml
similarity index 100%
rename from llm_on_ray/inference/models/hpu/llama-3-70b-chat-hf-hpu.yaml
rename to llm_on_ray/inference/models/hpu/meta-llama-3-70b-instruct-hpu.yaml
diff --git a/llm_on_ray/inference/models/hpu/llama-3-8b-instruct-hpu.yaml b/llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml
similarity index 100%
rename from llm_on_ray/inference/models/hpu/llama-3-8b-instruct-hpu.yaml
rename to llm_on_ray/inference/models/hpu/meta-llama-3-8b-instruct-hpu.yaml
diff --git a/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml b/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml
new file mode 100644
index 00000000..738ad49b
--- /dev/null
+++ b/llm_on_ray/inference/models/hpu/mistral-7b-v0.1-hpu.yaml
@@ -0,0 +1,13 @@
+port: 8000
+name: mistral-7b-v0.1
+route_prefix: /mistral-7b-v0.1
+num_replicas: 1
+cpus_per_worker: 8
+hpus_per_worker: 1
+device: hpu
+model_description:
+  model_id_or_path: mistralai/Mistral-7B-v0.1
+  tokenizer_name_or_path: mistralai/Mistral-7B-v0.1
+  chat_template: "llm_on_ray/inference/models/templates/template_mistral.jinja"
+  config:
+    use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml
new file mode 100644
index 00000000..df490837
--- /dev/null
+++ b/llm_on_ray/inference/models/hpu/mpt-7b-hpu.yaml
@@ -0,0 +1,13 @@
+port: 8000
+name: mpt-7b
+route_prefix: /mpt-7b
+num_replicas: 1
+cpus_per_worker: 8
+hpus_per_worker: 1
+device: hpu
+model_description:
+  model_id_or_path: mosaicml/mpt-7b
+  tokenizer_name_or_path: EleutherAI/gpt-neox-20b
+  config:
+    use_auth_token: ''
+    trust_remote_code: true
diff --git a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml
new file mode 100644
index 00000000..1973ae1a
--- /dev/null
+++ b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3-hpu.yaml
@@ -0,0 +1,19 @@
+port: 8000
+name: neural-chat-7b-v3-3
+route_prefix: /neural-chat-7b-v3-3
+num_replicas: 1
+cpus_per_worker: 0
+gpus_per_worker: 0
+hpus_per_worker: 1
+deepspeed: false
+workers_per_group: 2
+device: hpu
+ipex:
+  enabled: false
+  precision: bf16
+model_description:
+  model_id_or_path: Intel/neural-chat-7b-v3-3
+  tokenizer_name_or_path: Intel/neural-chat-7b-v3-3
+  chat_template: "llm_on_ray/inference/models/templates/template_neuralchat.jinja"
+  config:
+    use_auth_token: ''
diff --git a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml
index 64566a6d..1973ae1a 100644
--- a/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml
+++ b/llm_on_ray/inference/models/hpu/neural-chat-7b-v3-3.yaml
@@ -15,3 +15,5 @@ model_description:
   model_id_or_path: Intel/neural-chat-7b-v3-3
   tokenizer_name_or_path: Intel/neural-chat-7b-v3-3
   chat_template: "llm_on_ray/inference/models/templates/template_neuralchat.jinja"
+  config:
+    use_auth_token: ''
diff --git a/llm_on_ray/inference/predictors/hpu_predictor.py b/llm_on_ray/inference/predictors/hpu_predictor.py
index 5e19c873..30f8770c 100644
--- a/llm_on_ray/inference/predictors/hpu_predictor.py
+++ b/llm_on_ray/inference/predictors/hpu_predictor.py
@@ -341,7 +341,7 @@ def load_model(self):
             engine = deepspeed.init_inference(model, **ds_inference_kwargs)
             self.model = engine.module
 
-            if self.model.config.model_type == "llama":
+            if self.model.config.model_type in ["llama", "falcon", "qwen2"]:
 
                 def patch_scoped_linear_all_reduce(model):
                     from deepspeed.module_inject.layers import LinearAllreduce
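For reference, once one of these HPU configs is deployed with `llm_on_ray-serve`, the CI smoke-tests it through `examples/inference/api_server_openai/query_http_requests.py`. A rough standalone equivalent of that query, assuming the server listens on the `port: 8000` defined in these configs and exposes an OpenAI-compatible `/v1/chat/completions` route (the exact route and host are assumptions, not taken from this diff):

```python
import json

import requests  # third-party HTTP client: pip install requests

# Assumed endpoint; port 8000 comes from the YAML configs above, and the
# /v1/chat/completions path is the usual OpenAI-compatible chat route.
URL = "http://localhost:8000/v1/chat/completions"

payload = {
    "model": "gpt2",  # any served model name from the CI matrix
    "messages": [{"role": "user", "content": "Say hello in one sentence."}],
    "stream": False,
}

resp = requests.post(URL, json=payload, timeout=120)
resp.raise_for_status()
print(json.dumps(resp.json(), indent=2))
```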