Merge branch 'main' into support_resize_token_embeddings_for_tp

huggingface · Sep 30, 2024 · a77316e · a77316e
2 parents 1f89c9f + 1ef1e12
commit a77316e
Show file tree

Hide file tree

Showing 82 changed files with 1,030 additions and 548 deletions.
diff --git a/.github/workflows/build-neuronx-tgi.yml b/.github/workflows/build-neuronx-tgi.yml
@@ -0,0 +1,58 @@
+# Builds the optimum-neuron package, derives the package version from the
+# name of the tarball produced in dist/, then builds the NeuronX TGI docker
+# image and pushes it to ghcr.io tagged with that version and with `latest`.
+name: Build and push NeuronX docker image to ghcr.io
+
+on:
+  workflow_dispatch:
+  push:
+    tags:
+      - '*'
+
+jobs:
+  docker:
+    runs-on:
+      group: aws-general-8-plus
+    permissions:
+      # The steps below only read the repository; pushing the image needs packages: write.
+      contents: read
+      packages: write
+    steps:
+      -
+        name: Checkout sources
+        uses: actions/checkout@v4
+      -
+        name: Install python and create venv
+        run: |
+          sudo apt update
+          sudo apt install python3-venv python3-dev -y
+          python3 -m venv aws_neuron_venv_pytorch
+          source aws_neuron_venv_pytorch/bin/activate
+          python -m pip install -U pip
+          python -m pip install build
+          python -m pip config set global.extra-index-url https://pip.repos.neuron.amazonaws.com
+          python -m build .
+      -
+        name: Extract version
+        run: |
+          # dist/optimum_neuron-<version>.tar.gz -> <version>, exported as ON_VERSION for later steps
+          pkg=$(ls dist/optimum_neuron*.tar.gz); tmp=${pkg#*-}; echo "ON_VERSION=${tmp%.tar.gz*}" >> "$GITHUB_ENV"
+      -
+        name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v3
+      -
+        name: Login to ghcr.io
+        uses: docker/login-action@v3
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      -
+        name: Build and push
+        uses: docker/build-push-action@v6
+        with:
+          context: .
+          file: text-generation-inference/Dockerfile
+          push: true
+          build-args: VERSION=${{ env.ON_VERSION }}
+          tags: ghcr.io/huggingface/neuronx-tgi:${{ env.ON_VERSION }}, ghcr.io/huggingface/neuronx-tgi:latest
diff --git a/.github/workflows/doc-build.yml b/.github/workflows/doc-build.yml
@@ -51,7 +51,14 @@ jobs:
       - name: Make documentation
         shell: bash
         run: |
-          doc-builder build optimum.neuron docs/source/ --repo_name optimum-neuron --build_dir neuron-doc-build/ --version ${{ env.VERSION }} --version_tag_suffix "" --html --clean
+          doc-builder build optimum.neuron docs/source/ \
+            --repo_name optimum-neuron \
+            --build_dir neuron-doc-build/ \
+            --version ${{ env.VERSION }} \
+            --version_tag_suffix "" \
+            --html \
+            --clean \
+            --notebook_dir docs/notebooks/
           cd  neuron-doc-build/
           mv optimum.neuron optimum-neuron
-          doc-builder push optimum-neuron --doc_build_repo_id "hf-doc-build/doc-build" --token "${{ secrets.HF_DOC_BUILD_PUSH }}" --commit_msg "Updated with commit $COMMIT_SHA See: https://github.com/huggingface/optimum-neuron/commit/$COMMIT_SHA" --n_retries 5
+          doc-builder push optimum-neuron --doc_build_repo_id "hf-doc-build/doc-build" --token "${{ secrets.HF_DOC_BUILD_PUSH }}" --commit_msg "Updated with commit $COMMIT_SHA See: https://github.com/huggingface/optimum-neuron/commit/$COMMIT_SHA" --n_retries 5
diff --git a/.github/workflows/doc-pr-build.yml b/.github/workflows/doc-pr-build.yml
@@ -36,7 +36,14 @@ jobs:
       - name: Make documentation
         shell: bash
         run: |
-          doc-builder build optimum.neuron docs/source/ --repo_name optimum-neuron --build_dir neuron-doc-build/ --version pr_${{ env.PR_NUMBER }} --version_tag_suffix "" --html --clean
+          doc-builder build optimum.neuron docs/source/ \
+            --repo_name optimum-neuron \
+            --build_dir neuron-doc-build/ \
+            --version pr_${{ env.PR_NUMBER }} \
+            --version_tag_suffix "" \
+            --html \
+            --clean \
+            --notebook_dir docs/notebooks/
 
       - name: Save commit_sha & pr_number
         run: |

diff --git a/.github/workflows/inference_cache_llm.yml b/.github/workflows/inference_cache_llm.yml
@@ -21,9 +21,9 @@ jobs:
       matrix:
         config: [
           gpt2,
-          llama3-8b,
+          llama,
+          llama3.1-70b,
           llama3-70b,
-          llama2-7b-13b,
           llama2-70b,
           mistral,
           llama-variants,
@@ -39,7 +39,7 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.18.3.0 aws-neuronx-runtime-lib=2.21.41.0-fb1705f5f aws-neuronx-collectives=2.21.46.0-69b77134b  -y
+          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.14.0-6e27b8d5b aws-neuronx-collectives=2.22.26.0-17a033bc8  -y
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Checkout
         uses: actions/checkout@v4

diff --git a/.github/workflows/test_inf1_export.yml b/.github/workflows/test_inf1_export.yml
@@ -21,7 +21,7 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.17.1.0 aws-neuronx-runtime-lib=2.20.22.0-1b3ca6425 aws-neuronx-collectives=2.20.22.0-c101c322e  -y
+          sudo apt-get install aws-neuronx-tools=2.18.3.0 aws-neuronx-runtime-lib=2.21.41.0-fb1705f5f aws-neuronx-collectives=2.21.46.0-69b77134b  -y
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Checkout
         uses: actions/checkout@v2

diff --git a/.github/workflows/test_inf2.yml b/.github/workflows/test_inf2.yml
@@ -32,7 +32,7 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.17.1.0 aws-neuronx-runtime-lib=2.20.22.0-1b3ca6425 aws-neuronx-collectives=2.20.22.0-c101c322e  -y
+          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.14.0-6e27b8d5b aws-neuronx-collectives=2.22.26.0-17a033bc8  -y
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Checkout
         uses: actions/checkout@v2

diff --git a/.github/workflows/test_inf2_export.yml b/.github/workflows/test_inf2_export.yml
@@ -32,7 +32,7 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.17.1.0 aws-neuronx-runtime-lib=2.20.22.0-1b3ca6425 aws-neuronx-collectives=2.20.22.0-c101c322e  -y
+          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.14.0-6e27b8d5b aws-neuronx-collectives=2.22.26.0-17a033bc8  -y
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Checkout
         uses: actions/checkout@v2

diff --git a/.github/workflows/test_inf2_full_export.yml b/.github/workflows/test_inf2_full_export.yml
@@ -30,7 +30,7 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.17.1.0 aws-neuronx-runtime-lib=2.20.22.0-1b3ca6425 aws-neuronx-collectives=2.20.22.0-c101c322e  -y
+          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.14.0-6e27b8d5b aws-neuronx-collectives=2.22.26.0-17a033bc8  -y
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Checkout
         uses: actions/checkout@v2

diff --git a/.github/workflows/test_inf2_inference.yml b/.github/workflows/test_inf2_inference.yml
@@ -32,8 +32,11 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.17.1.0 aws-neuronx-runtime-lib=2.20.22.0-1b3ca6425 aws-neuronx-collectives=2.20.22.0-c101c322e  -y
+          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.14.0-6e27b8d5b aws-neuronx-collectives=2.22.26.0-17a033bc8  -y
           export PATH=/opt/aws/neuron/bin:$PATH
+      - name: Install cv2 dependencies
+        run: |
+          sudo apt-get install ffmpeg libsm6 libxext6  -y
       - name: Checkout
         uses: actions/checkout@v2
       - name: Install python dependencies

diff --git a/.github/workflows/test_inf2_tgi.yml b/.github/workflows/test_inf2_tgi.yml
@@ -34,7 +34,7 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.17.1.0 aws-neuronx-runtime-lib=2.20.22.0-1b3ca6425 aws-neuronx-collectives=2.20.22.0-c101c322e  -y
+          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.14.0-6e27b8d5b aws-neuronx-collectives=2.22.26.0-17a033bc8  -y
           export PATH=/opt/aws/neuron/bin:$PATH
       - name: Checkout
         uses: actions/checkout@v2

diff --git a/.github/workflows/test_trainium_common.yml b/.github/workflows/test_trainium_common.yml
@@ -34,8 +34,11 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.17.1.0 aws-neuronx-runtime-lib=2.20.22.0-1b3ca6425 aws-neuronx-collectives=2.20.22.0-c101c322e  -y
+          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.14.0-6e27b8d5b aws-neuronx-collectives=2.22.26.0-17a033bc8  -y
           export PATH=/opt/aws/neuron/bin:$PATH
+      - name: Install cv2 dependencies
+        run: |
+          sudo apt-get install ffmpeg libsm6 libxext6  -y
       - name: Checkout
         uses: actions/checkout@v2
       - name: Install python dependencies

diff --git a/.github/workflows/test_trainium_distributed.yml b/.github/workflows/test_trainium_distributed.yml
@@ -33,8 +33,11 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.17.1.0 aws-neuronx-runtime-lib=2.20.22.0-1b3ca6425 aws-neuronx-collectives=2.20.22.0-c101c322e  -y
+          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.14.0-6e27b8d5b aws-neuronx-collectives=2.22.26.0-17a033bc8  -y
           export PATH=/opt/aws/neuron/bin:$PATH
+      - name: Install cv2 dependencies
+        run: |
+          sudo apt-get install ffmpeg libsm6 libxext6  -y
       - name: Checkout
         uses: actions/checkout@v2
       - name: Setup PATH

diff --git a/.github/workflows/test_trainium_examples.yml b/.github/workflows/test_trainium_examples.yml
@@ -41,8 +41,11 @@ jobs:
           EOF
           wget -qO - https://apt.repos.neuron.amazonaws.com/GPG-PUB-KEY-AMAZON-AWS-NEURON.PUB | sudo apt-key add -
           sudo apt-get update -y
-          sudo apt-get install aws-neuronx-tools=2.17.1.0 aws-neuronx-runtime-lib=2.20.22.0-1b3ca6425 aws-neuronx-collectives=2.20.22.0-c101c322e  -y
+          sudo apt-get install aws-neuronx-tools=2.19.0.0 aws-neuronx-runtime-lib=2.22.14.0-6e27b8d5b aws-neuronx-collectives=2.22.26.0-17a033bc8  -y
           export PATH=/opt/aws/neuron/bin:$PATH
+      - name: Install cv2 dependencies
+        run: |
+          sudo apt-get install ffmpeg libsm6 libxext6  -y
       - name: Checkout
         uses: actions/checkout@v2
       - name: Setup PATH

diff --git a/Makefile b/Makefile
@@ -40,19 +40,15 @@ PACKAGE_FILES = $(PACKAGE_PYTHON_FILES)  \
 $(PACKAGE_DIST) $(PACKAGE_WHEEL): $(PACKAGE_FILES)
 	python -m build
 
-TGI_VERSION ?= 2.1.1
-
 neuronx-tgi: $(PACKAGE_DIST)
 	docker build --rm -f text-generation-inference/Dockerfile \
 	             --build-arg VERSION=$(VERSION) \
-	             --build-arg TGI_VERSION=$(TGI_VERSION) \
 				 -t neuronx-tgi:$(VERSION) .
 	docker tag neuronx-tgi:$(VERSION) neuronx-tgi:latest
 
 neuronx-tgi-sagemaker: $(PACKAGE_DIST)
 	docker build --rm -f text-generation-inference/Dockerfile \
 	             --build-arg VERSION=$(VERSION) \
-	             --build-arg TGI_VERSION=$(TGI_VERSION) \
 				 --target sagemaker \
 				 -t neuronx-tgi:$(VERSION) .
 
@@ -90,7 +86,7 @@ test_installs:
 tgi_server:
 	python -m pip install -r text-generation-inference/server/build-requirements.txt
 	make -C text-generation-inference/server clean
-	VERSION=${VERSION} TGI_VERSION=${TGI_VERSION} make -C text-generation-inference/server gen-server
+	VERSION=${VERSION} make -C text-generation-inference/server gen-server
 
 tgi_test: tgi_server
 	python -m pip install .[neuronx]

diff --git a/benchmark/text-generation/llama2-7b.py b/benchmark/text-generation/llama2-7b.py
diff --git a/benchmark/text-generation/llama2-13b.py → benchmark/text-generation/mistral_small.py b/benchmark/text-generation/llama2-13b.py → benchmark/text-generation/mistral_small.py
@@ -8,16 +8,14 @@
 
 
 def main():
-    NUM_CORES = 8
+    NUM_CORES = 12
     num_cores = get_available_cores()
     if num_cores < NUM_CORES:
         raise ValueError(f"This benchmark can only run on an instance with at least {NUM_CORES} cores.")
 
     model_configurations = {
-        "Llama-2-13B-BS1": ["meta-llama/Llama-2-13b-chat-hf", 1, 4096],
-        "Llama-2-13B-BS4": ["meta-llama/Llama-2-13b-chat-hf", 4, 4096],
-        "Llama-2-13B-BS8": ["meta-llama/Llama-2-13b-chat-hf", 8, 4096],
-        "Llama-2-13B-BS16": ["meta-llama/Llama-2-13b-chat-hf", 16, 4096],
+        "Mistral-Small-2409-BS1": ["mistralai/Mistral-Small-Instruct-2409", 1, 4096],
+        "Mistral-Small-2409-BS4": ["mistralai/Mistral-Small-Instruct-2409", 4, 4096],
     }
 
     for model_name, model_configuration in model_configurations.items():
@@ -27,7 +25,7 @@ def main():
             export=True,
             batch_size=batch_size,
             sequence_length=seq_length,
-            auto_cast_type="fp16",
+            auto_cast_type="bf16",
             num_cores=NUM_CORES,
         )
         with TemporaryDirectory() as tmpdir:

diff --git a/benchmark/text-generation/mistralv2.py b/benchmark/text-generation/mistralv2.py
diff --git a/docs/assets/benchmarks/inferentia-llama2-13b/latency.png b/docs/assets/benchmarks/inferentia-llama2-13b/latency.png
diff --git a/docs/assets/benchmarks/inferentia-llama2-13b/throughput.png b/docs/assets/benchmarks/inferentia-llama2-13b/throughput.png
diff --git a/docs/assets/benchmarks/inferentia-llama2-13b/ttft.png b/docs/assets/benchmarks/inferentia-llama2-13b/ttft.png
diff --git a/docs/assets/benchmarks/inferentia-llama2-7b/latency.png b/docs/assets/benchmarks/inferentia-llama2-7b/latency.png
diff --git a/docs/assets/benchmarks/inferentia-llama2-7b/throughput.png b/docs/assets/benchmarks/inferentia-llama2-7b/throughput.png
diff --git a/docs/assets/benchmarks/inferentia-llama2-7b/ttft.png b/docs/assets/benchmarks/inferentia-llama2-7b/ttft.png
diff --git a/docs/assets/benchmarks/inferentia-llama3-8b/latency.png b/docs/assets/benchmarks/inferentia-llama3-8b/latency.png
diff --git a/docs/assets/benchmarks/inferentia-llama3-8b/throughput.png b/docs/assets/benchmarks/inferentia-llama3-8b/throughput.png
diff --git a/docs/assets/benchmarks/inferentia-llama3-8b/ttft.png b/docs/assets/benchmarks/inferentia-llama3-8b/ttft.png
diff --git a/docs/assets/benchmarks/inferentia-llama3.1-8b/latency.png b/docs/assets/benchmarks/inferentia-llama3.1-8b/latency.png
diff --git a/docs/assets/benchmarks/inferentia-llama3.1-8b/throughput.png b/docs/assets/benchmarks/inferentia-llama3.1-8b/throughput.png
diff --git a/docs/assets/benchmarks/inferentia-llama3.1-8b/ttft.png b/docs/assets/benchmarks/inferentia-llama3.1-8b/ttft.png
diff --git a/docs/assets/benchmarks/inferentia-mistral-small/latency.png b/docs/assets/benchmarks/inferentia-mistral-small/latency.png
diff --git a/docs/assets/benchmarks/inferentia-mistral-small/throughput.png b/docs/assets/benchmarks/inferentia-mistral-small/throughput.png
diff --git a/docs/assets/benchmarks/inferentia-mistral-small/ttft.png b/docs/assets/benchmarks/inferentia-mistral-small/ttft.png
diff --git a/docs/assets/benchmarks/inferentia-mistral-v2/latency.png b/docs/assets/benchmarks/inferentia-mistral-v2/latency.png
diff --git a/docs/assets/benchmarks/inferentia-mistral-v2/throughput.png b/docs/assets/benchmarks/inferentia-mistral-v2/throughput.png
diff --git a/docs/assets/benchmarks/inferentia-mistral-v2/ttft.png b/docs/assets/benchmarks/inferentia-mistral-v2/ttft.png
diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
@@ -14,6 +14,8 @@
       title: Fine-tune BERT for Text Classification on AWS Trainium
     - local: training_tutorials/finetune_llm
       title: Fine-tune Llama 3 8B on AWS Trainium
+    - local: training_tutorials/sft_lora_finetune_llm
+      title: Fine-tune Llama 3 8B with LoRA and the SFTTrainer
     title: Training Tutorials
   - sections:
     - local: inference_tutorials/notebooks
@@ -44,14 +46,10 @@
       title: NeuronX Text-generation-inference for AWS inferentia2
     title: How-To Guides
   - sections:
-    - local: benchmarks/inferentia-llama2-7b
-      title: Llama2 7b on AWS Inferentia2
-    - local: benchmarks/inferentia-llama2-13b
-      title: Llama2 13b on AWS Inferentia2
-    - local: benchmarks/inferentia-mistral-v2
-      title: Mistral v0.2 7b on AWS Inferentia2
-    - local: benchmarks/inferentia-llama3-8b
-      title: Llama-3 8B on AWS Inferentia2
+    - local: benchmarks/inferentia-mistral-small
+      title: Mistral Small on AWS Inferentia2
+    - local: benchmarks/inferentia-llama3.1-8b
+      title: Llama-3.1 8B on AWS Inferentia2
     title: Benchmarks
   - sections:
     - local: community/contributing