Use AWS Neuron SDK 2.19.1 (#661)
* chore: AWS Neuron SDK 2.19.1

* test(decoder): change llama model

There is an issue with small llama models producing gibberish output with
AWS Neuron SDK 2.19.

* test(decoder): add mixtral config

* fix(Mixtral): disable continuous batching

There is a regression in AWS Neuron SDK 2.19, so we need to stick to
static batching.

* fix: nit in description

* chore(tgi): use AWS Neuron SDK 2.19.1

* test(tgi): adapt sample generations expectations

* feat(tgi): change llama model and update expectations

* docs: update Llama3-8B benchmark numbers

* perf(llama7b): use 12 cores and update numbers

* perf(tgi): add llama3 and use dp2 for llama2 7b

Each model now has its own docker-compose.yaml

* perf(tgi): update Llama-2-7b bench results

* perf(tgi): add Llama3 results

* perf(tgi): update llama3-70b 48xlarge results

* style: new ruff rules for type comparison

* test(decoder): refactor pipelines tests
dacorvo authored Jul 22, 2024
1 parent 2430f9f commit 278d76c
Showing 33 changed files with 248 additions and 105 deletions.
14 changes: 3 additions & 11 deletions benchmark/text-generation-inference/README.md
@@ -2,7 +2,7 @@

## Local environment setup

These configurations are tested and run on an inf2.48xlarge with the Hugging Face Deep Learning AMI from the AWS Marketplace.

Copy the configurations down using

@@ -44,16 +44,8 @@ Alternatively, you can edit the appropriate docker-compose.yaml to supply the fu

## Start the servers

For smaller models, you can use the multi-server configuration with a load balancer:

```shell
$ docker compose --env-file llama-7b/.env up
```

For larger models, use their specific docker compose files:

```shell
$ docker compose -f llama3-70b/docker-compose.yaml --env-file llama3-70b/.env up
$ docker compose -f llama3-8b/docker-compose.yaml --env-file llama3-8b/.env up
```

Note: edit the .env file to change the model configuration
@@ -87,7 +79,7 @@ $ ./run_all.sh NousResearch/Meta-Llama-3-70B-Instruct

### Compiling the model

If you are trying to run a configuration or a model that is not available in the cache, you can compile the model before you run it, then load it locally.

See the [llama3-70b-trn1.32xlarge](llama3-70b-trn1.32xlarge) as an example.
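For reference, the export can also be run ahead of time with the `optimum-cli` Neuron exporter; the sketch below uses illustrative values and should be adjusted to the batch size, sequence length, core count and dtype you intend to serve with:

```shell
# Compile (export) the model for Neuron ahead of serving; all values are illustrative.
$ optimum-cli export neuron \
  --model NousResearch/Meta-Llama-3-70B-Instruct \
  --batch_size 4 \
  --sequence_length 4096 \
  --num_cores 24 \
  --auto_cast_type fp16 \
  ./llama3-70b-neuron
```

The exported directory can then be mounted into the TGI container and referenced as `MODEL_ID`, so the server loads the precompiled artifacts instead of recompiling at startup.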

2 changes: 1 addition & 1 deletion benchmark/text-generation-inference/llama-7b/.env
@@ -1,5 +1,5 @@
MODEL_ID='NousResearch/Llama-2-7b-chat-hf'
HF_AUTO_CAST_TYPE='fp16'
MAX_BATCH_SIZE=32
MAX_BATCH_SIZE=24
MAX_INPUT_LENGTH=3072
MAX_TOTAL_TOKENS=4096
57 changes: 57 additions & 0 deletions benchmark/text-generation-inference/llama-7b/docker-compose.yaml
@@ -0,0 +1,57 @@
version: '3.7'

services:
tgi-1:
image: neuronx-tgi:latest
ports:
- "8081:8081"
environment:
- PORT=8081
- MODEL_ID=${MODEL_ID}
- HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
- HF_NUM_CORES=12
- MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
- MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
- MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
- MAX_CONCURRENT_REQUESTS=512
devices:
- "/dev/neuron0"
- "/dev/neuron1"
- "/dev/neuron2"
- "/dev/neuron3"
- "/dev/neuron4"
- "/dev/neuron5"

tgi-2:
image: neuronx-tgi:latest
ports:
- "8082:8082"
environment:
- PORT=8082
- MODEL_ID=${MODEL_ID}
- HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
- HF_NUM_CORES=12
- MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
- MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
- MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
- MAX_CONCURRENT_REQUESTS=512
devices:
- "/dev/neuron6"
- "/dev/neuron7"
- "/dev/neuron8"
- "/dev/neuron9"
- "/dev/neuron10"
- "/dev/neuron11"

loadbalancer:
image: nginx:alpine
ports:
- "8080:80"
volumes:
- ./nginx.conf:/etc/nginx/nginx.conf:ro
depends_on:
- tgi-1
- tgi-2
deploy:
placement:
constraints: [node.role == manager]
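As described in the README, each per-model configuration is started with its own compose file and `.env` file; assuming the commands are run from `benchmark/text-generation-inference`, this one comes up with:

```shell
$ docker compose -f llama-7b/docker-compose.yaml --env-file llama-7b/.env up
```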
14 changes: 14 additions & 0 deletions benchmark/text-generation-inference/llama-7b/nginx.conf
@@ -0,0 +1,14 @@
### Nginx TGI Load Balancer
events {}
http {
upstream tgicluster {
server tgi-1:8081;
server tgi-2:8082;
}
server {
listen 80;
location / {
proxy_pass http://tgicluster;
}
}
}
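Once the stack is up, nginx listens on host port 8080 and round-robins requests across the two TGI servers. A quick smoke test against the standard TGI `/generate` route (prompt and parameters are illustrative):

```shell
$ curl 127.0.0.1:8080/generate \
  -X POST \
  -H 'Content-Type: application/json' \
  -d '{"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 32}}'
```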
22 changes: 12 additions & 10 deletions benchmark/text-generation-inference/llama-7b/tgi-results.csv
@@ -1,11 +1,13 @@
model_id,concurrent requests,throughput (t/s),Time-to-first-token @ P50 (s),average latency (ms)
huggingface/NousResearch/Llama-2-7b-chat-hf,1,13.811941495564616,0.3781782309997652,71.37198062194233
huggingface/NousResearch/Llama-2-7b-chat-hf,2,23.461539426271507,0.3602376449998701,71.70553820509232
huggingface/NousResearch/Llama-2-7b-chat-hf,4,45.45448705790145,0.3612828944997091,73.58663426819392
huggingface/NousResearch/Llama-2-7b-chat-hf,8,71.13444471932405,0.3752646894999998,74.85884378373552
huggingface/NousResearch/Llama-2-7b-chat-hf,16,138.54599491404485,0.6447374934998606,81.11484812939682
huggingface/NousResearch/Llama-2-7b-chat-hf,32,247.32811870027916,1.0393478490004782,85.0958261705239
huggingface/NousResearch/Llama-2-7b-chat-hf,64,391.3595246354876,2.2831421710016,99.36474989676213
huggingface/NousResearch/Llama-2-7b-chat-hf,128,464.82600069905294,3.342431744500118,120.29151899306808
huggingface/NousResearch/Llama-2-7b-chat-hf,256,526.7164477974997,6.532527566999306,160.52458146930456
huggingface/NousResearch/Llama-2-7b-chat-hf,512,506.7975712115936,27.33909000099993,260.14547684970137
huggingface/NousResearch/Llama-2-7b-chat-hf,1,32.32303647791415,0.4092339959997844,32.98457994546189
huggingface/NousResearch/Llama-2-7b-chat-hf,8,280.2455817454919,0.4103973410001345,18.824823855788903
huggingface/NousResearch/Llama-2-7b-chat-hf,16,606.2237208004269,0.42390128999977605,19.73879150452322
huggingface/NousResearch/Llama-2-7b-chat-hf,24,778.5847225896651,0.44628154350084515,21.729555672304947
huggingface/NousResearch/Llama-2-7b-chat-hf,32,660.0774421854719,0.6625862749997395,40.97050480951723
huggingface/NousResearch/Llama-2-7b-chat-hf,50,809.3513111702051,1.1112228684996808,32.355166522075024
huggingface/NousResearch/Llama-2-7b-chat-hf,64,902.2019208540152,1.518584174499665,34.52519498921747
huggingface/NousResearch/Llama-2-7b-chat-hf,96,1000.426066970307,2.581174633500268,44.06432188527795
huggingface/NousResearch/Llama-2-7b-chat-hf,100,965.894643860531,5.110174397500032,44.77109148855796
huggingface/NousResearch/Llama-2-7b-chat-hf,128,1070.1775339135268,7.600166947499474,49.67094403911358
huggingface/NousResearch/Llama-2-7b-chat-hf,150,1059.6704842739082,8.564977125500263,55.47304516981905
huggingface/NousResearch/Llama-2-7b-chat-hf,200,1138.036018763616,13.465086967999923,64.28243222745746
@@ -1,11 +1,4 @@
model_id,concurrent requests,throughput (t/s),Time-to-first-token @ P50 (s),average latency (ms)
huggingface/NousResearch/Meta-Llama-3-70B-Instruct,1,18.818667211424472,1.3884793975012144,51.46871325828836
huggingface/NousResearch/Meta-Llama-3-70B-Instruct,2,32.22257477833452,2.0121661404991755,56.734265583687296
huggingface/NousResearch/Meta-Llama-3-70B-Instruct,4,50.19917175671667,5.205651430500438,66.04042245148653
huggingface/NousResearch/Meta-Llama-3-70B-Instruct,8,52.13272738944358,9.568476632499369,97.32615035298838
huggingface/NousResearch/Meta-Llama-3-70B-Instruct,16,53.59997031445967,26.087651531999654,191.19227161475598
huggingface/NousResearch/Meta-Llama-3-70B-Instruct,32,56.08684244759754,61.25285707449984,310.16900484570965
huggingface/NousResearch/Meta-Llama-3-70B-Instruct,64,57.40338464731561,129.3146581359997,560.2474255463762
huggingface/NousResearch/Meta-Llama-3-70B-Instruct,128,58.39025853766574,267.3882590960002,1094.9986170264501
huggingface/NousResearch/Meta-Llama-3-70B-Instruct,256,58.589480601098536,541.6153878579971,2147.5413489446523
huggingface/NousResearch/Meta-Llama-3-70B-Instruct,512,58.69645477077839,1085.1772966810022,4231.7554182432905
huggingface/NousResearch/Meta-Llama-3-70B-Instruct,1,22.4846564103628,1.2006561384987435,43.7079989917263
huggingface/NousResearch/Meta-Llama-3-70B-Instruct,8,64.09420641185218,4.925495064999268,85.36754380113435
huggingface/NousResearch/Meta-Llama-3-70B-Instruct,16,66.62450133873871,22.87049979600488,135.4123021951354
5 changes: 5 additions & 0 deletions benchmark/text-generation-inference/llama3-8b/.env
@@ -0,0 +1,5 @@
MODEL_ID='NousResearch/Meta-Llama-3-8B-Instruct'
HF_AUTO_CAST_TYPE='fp16'
MAX_BATCH_SIZE=32
MAX_INPUT_LENGTH=4000
MAX_TOTAL_TOKENS=4096
13 changes: 13 additions & 0 deletions benchmark/text-generation-inference/llama3-8b/tgi-results.csv
@@ -0,0 +1,13 @@
model_id,concurrent requests,throughput (t/s),Time-to-first-token @ P50 (s),average latency (ms)
huggingface/NousResearch/Meta-Llama-3-8B-Instruct,1,44.72947298811359,0.2930618720001803,21.387192412995546
huggingface/NousResearch/Meta-Llama-3-8B-Instruct,8,254.26638394677616,0.3072573690005811,24.51789344634094
huggingface/NousResearch/Meta-Llama-3-8B-Instruct,16,396.49578354796415,0.31329568949968234,29.41915454622028
huggingface/NousResearch/Meta-Llama-3-8B-Instruct,24,458.9461712504898,0.31723227349903027,36.45821381291491
huggingface/NousResearch/Meta-Llama-3-8B-Instruct,32,540.3852559365118,0.31949053349944734,39.548380672987705
huggingface/NousResearch/Meta-Llama-3-8B-Instruct,50,648.5983772802653,0.6981559694995667,47.64409672960739
huggingface/NousResearch/Meta-Llama-3-8B-Instruct,64,729.1746189461367,0.8981061290014623,51.60748655120524
huggingface/NousResearch/Meta-Llama-3-8B-Instruct,96,823.7525735951876,1.1334064394995949,60.62761554646364
huggingface/NousResearch/Meta-Llama-3-8B-Instruct,100,829.9821677199822,1.2774171685014153,63.72698943226848
huggingface/NousResearch/Meta-Llama-3-8B-Instruct,128,838.2579077776568,1.5125607664995186,74.21572967679927
huggingface/NousResearch/Meta-Llama-3-8B-Instruct,150,849.2518611727152,1.7492157529995893,80.71755481115194
huggingface/NousResearch/Meta-Llama-3-8B-Instruct,200,864.4918328198065,2.4135443449995364,98.63394209087461
73 changes: 73 additions & 0 deletions benchmark/text-generation-inference/mistral-7b/docker-compose.yaml
@@ -0,0 +1,73 @@
version: '3.7'

services:
tgi-1:
image: neuronx-tgi:latest
ports:
- "8081:8081"
environment:
- PORT=8081
- MODEL_ID=${MODEL_ID}
- HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
- HF_NUM_CORES=8
- MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
- MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
- MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
- MAX_CONCURRENT_REQUESTS=512
devices:
- "/dev/neuron0"
- "/dev/neuron1"
- "/dev/neuron2"
- "/dev/neuron3"

tgi-2:
image: neuronx-tgi:latest
ports:
- "8082:8082"
environment:
- PORT=8082
- MODEL_ID=${MODEL_ID}
- HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
- HF_NUM_CORES=8
- MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
- MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
- MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
- MAX_CONCURRENT_REQUESTS=512
devices:
- "/dev/neuron4"
- "/dev/neuron5"
- "/dev/neuron6"
- "/dev/neuron7"

tgi-3:
image: neuronx-tgi:latest
ports:
- "8083:8083"
environment:
- PORT=8083
- MODEL_ID=${MODEL_ID}
- HF_AUTO_CAST_TYPE=${HF_AUTO_CAST_TYPE}
- HF_NUM_CORES=8
- MAX_BATCH_SIZE=${MAX_BATCH_SIZE}
- MAX_INPUT_LENGTH=${MAX_INPUT_LENGTH}
- MAX_TOTAL_TOKENS=${MAX_TOTAL_TOKENS}
- MAX_CONCURRENT_REQUESTS=512
devices:
- "/dev/neuron8"
- "/dev/neuron9"
- "/dev/neuron10"
- "/dev/neuron11"

loadbalancer:
image: nginx:alpine
ports:
- "8080:80"
volumes:
- ./nginx.conf:/etc/nginx/nginx.conf:ro
depends_on:
- tgi-1
- tgi-2
- tgi-3
deploy:
placement:
constraints: [node.role == manager]
15 changes: 15 additions & 0 deletions benchmark/text-generation-inference/mistral-7b/nginx.conf
@@ -0,0 +1,15 @@
### Nginx TGI Load Balancer
events {}
http {
upstream tgicluster {
server tgi-1:8081;
server tgi-2:8082;
server tgi-3:8083;
}
server {
listen 80;
location / {
proxy_pass http://tgicluster;
}
}
}
4 changes: 2 additions & 2 deletions benchmark/text-generation/llama2-7b.py
@@ -8,7 +8,7 @@


def main():
NUM_CORES = 8
NUM_CORES = 12
num_cores = get_available_cores()
if num_cores < NUM_CORES:
raise ValueError(f"This benchmark can only run on an instance with at least {NUM_CORES} cores.")
@@ -18,7 +18,7 @@ def main():
"Llama-2-7B-BS4": ["meta-llama/Llama-2-7b-chat-hf", 4, 4096],
"Llama-2-7B-BS8": ["meta-llama/Llama-2-7b-chat-hf", 8, 4096],
"Llama-2-7B-BS16": ["meta-llama/Llama-2-7b-chat-hf", 16, 4096],
"Llama-2-7B-BS32": ["meta-llama/Llama-2-7b-chat-hf", 32, 4096],
"Llama-2-7B-BS24": ["meta-llama/Llama-2-7b-chat-hf", 24, 4096],
}

for model_name, model_configuration in model_configurations.items():
Binary file modified docs/assets/benchmarks/inferentia-llama2-7b/latency.png
Binary file modified docs/assets/benchmarks/inferentia-llama2-7b/throughput.png
Binary file modified docs/assets/benchmarks/inferentia-llama2-7b/ttft.png
Binary file modified docs/assets/benchmarks/inferentia-llama3-8b/latency.png
Binary file modified docs/assets/benchmarks/inferentia-llama3-8b/throughput.png
Binary file modified docs/assets/benchmarks/inferentia-llama3-8b/ttft.png
4 changes: 2 additions & 2 deletions docs/source/benchmarks/inferentia-llama2-7b.mdx
@@ -26,9 +26,9 @@ For this benchmark we will use the following configurations:
| Llama2 7B BS4 | 4 | 4096 |
| Llama2 7B BS8 | 8 | 4096 |
| Llama2 7B BS16 | 16 | 4096 |
| Llama2 7B BS32 | 32 | 4096 |
| Llama2 7B BS24 | 24 | 4096 |

*Note: all models are compiled to use 4 devices corresponding to 8 cores on the `inf2.48xlarge` instance.*
*Note: all models are compiled to use 6 devices corresponding to 12 cores on the `inf2.48xlarge` instance.*

*Note: please refer to the [inferentia2 product page](https://aws.amazon.com/ec2/instance-types/inf2/) for details on the available instances.*

2 changes: 1 addition & 1 deletion optimum/exporters/neuron/model_configs.py
@@ -869,4 +869,4 @@ class MixtralNeuronConfig(TextNeuronDecoderConfig):
@register_in_tasks_manager("mixtral", "text-generation")
class MixtralNeuronConfig(TextNeuronDecoderConfig):
NEURONX_CLASS = "mixtral.model.MixtralForSampling"
CONTINUOUS_BATCHING = True
CONTINUOUS_BATCHING = False
2 changes: 1 addition & 1 deletion optimum/neuron/distributed/checkpointing.py
@@ -65,7 +65,7 @@ def convert_fn(tensors):
return rewritten_tensors

def select_fn(v):
return type(v) == xser.TensorReference
return type(v) is xser.TensorReference

return xm.ToXlaTensorArena(convert_fn, select_fn).transform(ref_data)

2 changes: 1 addition & 1 deletion optimum/neuron/generation/utils.py
@@ -1628,7 +1628,7 @@ def beam_search(
)

for k, v in sequence_outputs.items():
if type(v) == torch.Tensor:
if type(v) is torch.Tensor:
sequence_outputs[k] = sequence_outputs[k].to(input_ids.device)

if return_dict_in_generate:
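The `==` to `is` changes in this file and in `checkpointing.py` follow ruff's E721 rule on type comparisons; assuming ruff is installed, the rule can be checked locally with:

```shell
$ ruff check --select E721 optimum/
```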
2 changes: 1 addition & 1 deletion optimum/neuron/version.py
@@ -14,4 +14,4 @@

__version__ = "0.0.24.dev0"

__sdk_version__ = "2.18.0"
__sdk_version__ = "2.19.1"
10 changes: 5 additions & 5 deletions setup.py
@@ -61,12 +61,12 @@
],
"neuronx": [
"wheel",
"neuronx-cc==2.13.66.0",
"torch-neuronx==2.1.2.2.1.0",
"transformers-neuronx==0.10.0.21",
"neuronx-cc==2.14.227.0",
"torch-neuronx==2.1.2.2.2.0",
"transformers-neuronx==0.11.351",
"torch==2.1.2.*",
"torchvision==0.16.*",
"neuronx_distributed==0.7.0",
"neuronx_distributed==0.8.0",
],
"diffusers": ["diffusers>=0.28.0, <0.29.0", "peft"],
"sentence-transformers": ["sentence-transformers >= 2.2.0"],
@@ -77,7 +77,7 @@
version=__version__,
description=(
"Optimum Neuron is the interface between the Hugging Face Transformers and Diffusers libraries and AWS "
"Tranium and Inferentia accelerators. It provides a set of tools enabling easy model loading, training and "
"Trainium and Inferentia accelerators. It provides a set of tools enabling easy model loading, training and "
"inference on single and multiple neuron core settings for different downstream tasks."
),
long_description=open("README.md", "r", encoding="utf-8").read(),
6 changes: 5 additions & 1 deletion tests/decoder/conftest.py
@@ -30,13 +30,17 @@
"export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"},
},
"llama": {
"model_id": "princeton-nlp/Sheared-LLaMA-1.3B",
"model_id": "NousResearch/Hermes-2-Theta-Llama-3-8B",
"export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"},
},
"mistral": {
"model_id": "optimum/mistral-1.1b-testing",
"export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
},
"mixtral": {
"model_id": "dacorvo/Mixtral-tiny",
"export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"},
},
}

