From 09cc51bd84466adedb232f5b56ec38ca080823d8 Mon Sep 17 00:00:00 2001
From: David Corvoysier
Date: Wed, 26 Jun 2024 14:14:31 +0200
Subject: [PATCH] Refactor decoder tests (#641)

* test: remove staging_test fixture

* test(decoder): regroup decoder generation tests

* fix(decoder): save checkpoint after export from local

* test: regroup decoder tests

* fix(tgi): temporarily pin ravif version

* ci: run decoder tests
---
 .github/workflows/test_inf2.yml       |   6 +-
 optimum/neuron/modeling_decoder.py    |  16 +--
 tests/conftest.py                     |  35 -----
 tests/decoder/conftest.py             | 122 ++++++++++++++++++
 tests/decoder/test_decoder_export.py  |  87 +++++++++++++
 .../test_decoder_generation.py}       |  76 +++++++++--
 tests/decoder/test_decoder_hub.py     |  65 ++++++++++
 .../test_fused_logits_warper.py       |   0
 tests/generation/conftest.py          |  37 +-----
 tests/generation/test_export.py       |  40 +-----
 tests/generation/test_hub.py          |  32 +----
 tests/generation/test_tnx_generate.py | 105 ---------------
 text-generation-inference/Dockerfile  |   2 +
 13 files changed, 357 insertions(+), 266 deletions(-)
 create mode 100644 tests/decoder/conftest.py
 create mode 100644 tests/decoder/test_decoder_export.py
 rename tests/{generation/test_tnx_llama.py => decoder/test_decoder_generation.py} (59%)
 create mode 100644 tests/decoder/test_decoder_hub.py
 rename tests/{generation => decoder}/test_fused_logits_warper.py (100%)
 delete mode 100644 tests/generation/test_tnx_generate.py

diff --git a/.github/workflows/test_inf2.yml b/.github/workflows/test_inf2.yml
index 567aa8980..c709a30df 100644
--- a/.github/workflows/test_inf2.yml
+++ b/.github/workflows/test_inf2.yml
@@ -52,7 +52,11 @@ jobs:
         run: |
           source aws_neuron_venv_pytorch/bin/activate
           HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/cli
-      - name: Run generation tests
+      - name: Run decoder tests
+        run: |
+          source aws_neuron_venv_pytorch/bin/activate
+          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/decoder
+      - name: Run other generation tests
         run: |
           source aws_neuron_venv_pytorch/bin/activate
           HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} pytest -m is_inferentia_test tests/generation
diff --git a/optimum/neuron/modeling_decoder.py b/optimum/neuron/modeling_decoder.py
index 44d434dab..a2efea9bf 100644
--- a/optimum/neuron/modeling_decoder.py
+++ b/optimum/neuron/modeling_decoder.py
@@ -445,21 +445,19 @@ def forward(self, *args, **kwargs):
     def _save_pretrained(self, save_directory: Union[str, Path]):
         dst_checkpoint_path, dst_compiled_path = self._get_neuron_dirs(save_directory)
 
-        def copy_dir_to_path(src_dir: Union[str, Path, TemporaryDirectory], dst_path: Union[str, Path]):
-            if isinstance(src_dir, TemporaryDirectory):
-                shutil.copytree(src_dir.name, dst_path, dirs_exist_ok=True)
-            elif not os.path.samefile(src_dir, dst_path):
-                os.symlink(dst_path, src_dir)
-
-        # Copy checkpoint directory (it always exists)
-        copy_dir_to_path(self.checkpoint_dir, dst_checkpoint_path)
+        neuron_config = getattr(self.config, "neuron")
+        checkpoint_id = neuron_config.get("checkpoint_id", None)
+        if checkpoint_id is None:
+            # Model was exported from a local path, so we need to save the checkpoint
+            shutil.copytree(self.checkpoint_dir, dst_checkpoint_path, dirs_exist_ok=True)
         self.checkpoint_dir = dst_checkpoint_path
 
+        # Save or create compiled directory
         if self.compiled_dir is None:
             # The compilation artifacts have never been saved, do it now
             self.model.save(dst_compiled_path)
         else:
-            copy_dir_to_path(self.compiled_dir, dst_compiled_path)
+            shutil.copytree(self.compiled_dir, dst_compiled_path)
         self.compiled_dir = dst_compiled_path
 
         self.generation_config.save_pretrained(save_directory)
diff --git a/tests/conftest.py b/tests/conftest.py
index ed3166ea4..a681ed087 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -25,7 +25,6 @@
     set_custom_cache_repo_name_in_hf_home,
     set_neuron_cache_path,
 )
-from optimum.utils.testing_utils import TOKEN, USER
 
 from .utils import OPTIMUM_INTERNAL_TESTING_CACHE_REPO, get_random_string
 
@@ -88,40 +87,6 @@ def inf_diffuser_model(request):
     return request.param
 
 
-@pytest.fixture(scope="module")
-def staging_test():
-    custom_cache_repo_name = "optimum-neuron-cache-testing"
-    custom_cache_repo = f"{USER}/{custom_cache_repo_name}"
-    custom_private_cache_repo = f"{custom_cache_repo}-private"
-
-    orig_token = get_token()
-    orig_custom_cache_repo = load_custom_cache_repo_name_from_hf_home()
-
-    seed = get_random_string(5)
-    custom_cache_repo_with_seed = f"{custom_cache_repo}-{seed}"
-    custom_private_cache_repo_with_seed = f"{custom_private_cache_repo}-{seed}"
-
-    login(token=TOKEN)
-    # We do not set which cache repo to use because there are two, it is up to the test to define that.
-
-    create_repo(custom_cache_repo_with_seed, repo_type="model", exist_ok=True)
-    create_repo(custom_private_cache_repo_with_seed, repo_type="model", exist_ok=True, private=True)
-
-    yield
-
-    delete_repo(custom_cache_repo_with_seed, repo_type="model")
-    delete_repo(custom_private_cache_repo_with_seed, repo_type="model")
-
-    if orig_token is not None:
-        login(token=orig_token)
-    else:
-        logout()
-    if orig_custom_cache_repo is not None:
-        set_custom_cache_repo_name_in_hf_home(orig_custom_cache_repo, check_repo=False)
-    else:
-        delete_custom_cache_repo_name_from_hf_home()
-
-
 def _hub_test(create_local_cache: bool = False):
     orig_token = get_token()
     orig_custom_cache_repo = load_custom_cache_repo_name_from_hf_home()
diff --git a/tests/decoder/conftest.py b/tests/decoder/conftest.py
new file mode 100644
index 000000000..1b02a20de
--- /dev/null
+++ b/tests/decoder/conftest.py
@@ -0,0 +1,122 @@
+import copy
+import logging
+import subprocess
+import sys
+from tempfile import TemporaryDirectory
+
+import huggingface_hub
+import pytest
+from transformers import AutoTokenizer
+
+from optimum.neuron import NeuronModelForCausalLM
+from optimum.neuron.utils import synchronize_hub_cache
+from optimum.neuron.version import __sdk_version__ as sdk_version
+from optimum.neuron.version import __version__ as version
+
+
+logging.basicConfig(
+    level=logging.INFO,
+    format="[%(asctime)s] %(levelname)s [%(filename)s.%(funcName)s:%(lineno)d] %(message)s",
+    stream=sys.stdout,
+)
+logger = logging.getLogger(__file__)
+
+OPTIMUM_CACHE_REPO_ID = "optimum-internal-testing/neuron-testing-cache"
+
+# All model configurations below will be added to the neuron_model_config fixture
+DECODER_MODEL_CONFIGURATIONS = {
+    "gpt2": {
+        "model_id": "gpt2",
+        "export_kwargs": {"batch_size": 4, "sequence_length": 1024, "num_cores": 2, "auto_cast_type": "fp16"},
+    },
+    "llama": {
+        "model_id": "princeton-nlp/Sheared-LLaMA-1.3B",
+        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "fp16"},
+    },
+    "mistral": {
+        "model_id": "optimum/mistral-1.1b-testing",
+        "export_kwargs": {"batch_size": 4, "sequence_length": 4096, "num_cores": 2, "auto_cast_type": "bf16"},
+    },
+}
+
+
+def _get_hub_neuron_model_id(config_name: str):
+    return f"optimum-internal-testing/neuron-testing-{version}-{sdk_version}-{config_name}"
+
+
+def _export_model(model_id, export_kwargs, neuron_model_path):
+    export_command = ["optimum-cli", "export", "neuron", "-m", model_id, "--task", "text-generation"]
+    for kwarg, value in export_kwargs.items():
+        export_command.append(f"--{kwarg}")
+        export_command.append(str(value))
+    export_command.append(neuron_model_path)
+    logger.info(f"Exporting {model_id} with {export_kwargs}")
+    try:
+        subprocess.run(export_command, check=True)
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Failed to export model: {e}")
+        return
+
+
+@pytest.fixture(scope="session", params=DECODER_MODEL_CONFIGURATIONS.keys())
+def neuron_decoder_config(request):
+    """Expose a pre-trained neuron decoder model
+
+    The fixture first makes sure the following model artifacts are present on the hub:
+    - exported neuron model under optimum-internal-testing/neuron-testing-<version>-<sdk_version>-<config_name>,
+    - cached artifacts under optimum-internal-testing/neuron-testing-cache.
+    If not, it will export the model and push it to the hub.
+
+    It then fetches the model locally and returns a dictionary containing:
+    - a configuration name,
+    - the original model id,
+    - the export parameters,
+    - the neuron model id,
+    - the neuron model local path.
+
+    For each exposed model, the local directory is maintained for the duration of the
+    test session and cleaned up afterwards.
+    The hub model artifacts are never cleaned up and persist across sessions.
+    They must be cleaned up manually when the optimum-neuron version changes.
+
+    """
+    config_name = request.param
+    model_config = copy.deepcopy(DECODER_MODEL_CONFIGURATIONS[request.param])
+    model_id = model_config["model_id"]
+    export_kwargs = model_config["export_kwargs"]
+    neuron_model_id = _get_hub_neuron_model_id(config_name)
+    with TemporaryDirectory() as neuron_model_path:
+        hub = huggingface_hub.HfApi()
+        if hub.repo_exists(neuron_model_id):
+            logger.info(f"Fetching {neuron_model_id} from the HuggingFace hub")
+            hub.snapshot_download(neuron_model_id, local_dir=neuron_model_path)
+        else:
+            _export_model(model_id, export_kwargs, neuron_model_path)
+            tokenizer = AutoTokenizer.from_pretrained(model_id)
+            tokenizer.save_pretrained(neuron_model_path)
+            del tokenizer
+            # Create the test model on the hub
+            hub.create_repo(neuron_model_id, private=True)
+            hub.upload_folder(
+                folder_path=neuron_model_path,
+                repo_id=neuron_model_id,
+                ignore_patterns=[NeuronModelForCausalLM.CHECKPOINT_DIR + "/*"],
+            )
+            # Make sure it is cached
+            synchronize_hub_cache(cache_repo_id=OPTIMUM_CACHE_REPO_ID)
+        # Add dynamic parameters to the model configuration
+        model_config["neuron_model_path"] = neuron_model_path
+        model_config["neuron_model_id"] = neuron_model_id
+        # Also add model configuration name to allow tests to adapt their expectations
+        model_config["name"] = config_name
+        # Yield instead of returning to keep a reference to the temporary directory.
+        # It will go out of scope and be released only once all tests needing the fixture
+        # have been completed.
+        logger.info(f"{config_name} ready for testing ...")
+        yield model_config
+        logger.info(f"Done with {config_name}")
+
+
+@pytest.fixture(scope="module")
+def neuron_decoder_path(neuron_decoder_config):
+    yield neuron_decoder_config["neuron_model_path"]
diff --git a/tests/decoder/test_decoder_export.py b/tests/decoder/test_decoder_export.py
new file mode 100644
index 000000000..4aaed4a20
--- /dev/null
+++ b/tests/decoder/test_decoder_export.py
@@ -0,0 +1,87 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from tempfile import TemporaryDirectory
+
+import pytest
+from transformers import AutoModelForCausalLM
+
+from optimum.neuron import NeuronModelForCausalLM
+from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx
+
+
+DECODER_MODEL_ARCHITECTURES = ["bloom", "gpt2", "llama", "mistral", "mixtral", "opt"]
+DECODER_MODEL_NAMES = {
+    "bloom": "hf-internal-testing/tiny-random-BloomForCausalLM",
+    "gpt2": "hf-internal-testing/tiny-random-gpt2",
+    "llama": "dacorvo/tiny-random-llama",
+    "mistral": "dacorvo/tiny-random-MistralForCausalLM",
+    "mixtral": "dacorvo/Mixtral-tiny",
+    "opt": "hf-internal-testing/tiny-random-OPTForCausalLM",
+}
+
+
+@pytest.fixture(
+    scope="session", params=[DECODER_MODEL_NAMES[model_arch] for model_arch in DECODER_MODEL_ARCHITECTURES]
+)
+def export_decoder_id(request):
+    return request.param
+
+
+def check_neuron_model(neuron_model, batch_size=None, sequence_length=None, num_cores=None, auto_cast_type=None):
+    neuron_config = getattr(neuron_model.config, "neuron", None)
+    assert neuron_config
+    if batch_size:
+        assert neuron_config["batch_size"] == batch_size
+    if sequence_length:
+        assert neuron_config["sequence_length"] == sequence_length
+    if num_cores:
+        assert neuron_config["num_cores"] == num_cores
+    if auto_cast_type:
+        assert neuron_config["auto_cast_type"] == auto_cast_type
+
+
+@pytest.mark.parametrize(
+    "batch_size, sequence_length, num_cores, auto_cast_type",
+    [
+        [1, 100, 2, "fp32"],
+        [1, 100, 2, "fp16"],
+        [2, 100, 2, "fp16"],
+    ],
+)
+@is_inferentia_test
+@requires_neuronx
+@pytest.mark.parametrize("local", [True, False], ids=["local", "from_hub"])
+def test_decoder_export_save_reload(local, export_decoder_id, batch_size, sequence_length, num_cores, auto_cast_type):
+    export_kwargs = {
+        "batch_size": batch_size,
+        "sequence_length": sequence_length,
+        "num_cores": num_cores,
+        "auto_cast_type": auto_cast_type,
+    }
+    with TemporaryDirectory() as model_path:
+        if local:
+            with TemporaryDirectory() as tmpdir:
+                model = AutoModelForCausalLM.from_pretrained(export_decoder_id)
+                model.save_pretrained(tmpdir)
+                model = NeuronModelForCausalLM.from_pretrained(tmpdir, export=True, **export_kwargs)
+                model.save_pretrained(model_path)
+        else:
+            model = NeuronModelForCausalLM.from_pretrained(export_decoder_id, export=True, **export_kwargs)
+            model.save_pretrained(model_path)
+        check_neuron_model(model, **export_kwargs)
+        del model
+        model = NeuronModelForCausalLM.from_pretrained(model_path)
+        check_neuron_model(model, **export_kwargs)
diff --git a/tests/generation/test_tnx_llama.py b/tests/decoder/test_decoder_generation.py
similarity index 59%
rename from tests/generation/test_tnx_llama.py
rename to tests/decoder/test_decoder_generation.py
index 3876e63f2..c2e67707f 100644
--- a/tests/generation/test_tnx_llama.py
+++ b/tests/decoder/test_decoder_generation.py
@@ -18,24 +18,78 @@
 import pytest
 import torch
 from transformers import AutoTokenizer
+from transformers.generation import StoppingCriteria
 
 from optimum.neuron import NeuronModelForCausalLM
 from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx
 
 
 @pytest.fixture(scope="module")
-def neuron_model_config():
-    model_id = "princeton-nlp/Sheared-LLaMA-1.3B"
-    model_kwargs = {"batch_size": 4, "sequence_length": 4096, "auto_cast_type": "f16", "num_cores": 2}
-    model = NeuronModelForCausalLM.from_pretrained(model_id, export=True, **model_kwargs)
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
+def model_and_tokenizer(neuron_decoder_path):
+    model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path)
+    tokenizer = AutoTokenizer.from_pretrained(neuron_decoder_path)
     yield (model, tokenizer)
 
 
+def _test_generation(model, batch_size, input_length, **gen_kwargs):
+    input_ids = torch.ones((batch_size, input_length), dtype=torch.int64)
+    sample_output = model.generate(input_ids, **gen_kwargs)
+    assert sample_output.shape[0] == batch_size
+
+
+@pytest.mark.parametrize(
+    "gen_kwargs",
+    [
+        {"do_sample": True},
+        {"do_sample": True, "temperature": 0.7},
+        {"do_sample": False},
+        {"do_sample": False, "repetition_penalty": 1.2},
+    ],
+    ids=["sample", "sample-with-temp", "greedy", "greedy_no-repeat"],
+)
+@is_inferentia_test
+@requires_neuronx
+def test_decoder_generation_base(model_and_tokenizer, gen_kwargs):
+    model = model_and_tokenizer[0]
+    _test_generation(model, model.batch_size, 10, **gen_kwargs)
+
+
+@is_inferentia_test
+@requires_neuronx
+def test_decoder_generation_input_dimensions(model_and_tokenizer):
+    model, tokenizer = model_and_tokenizer
+    # Using valid input dimensions
+    _test_generation(model, model.batch_size, model.max_length // 2)
+    # Using an incompatible batch_size
+    with pytest.raises(ValueError, match="The specified batch_size"):
+        _test_generation(model, model.batch_size + 1, model.max_length)
+    # Using an incompatible input length
+    with pytest.raises(ValueError, match="The input sequence length"):
+        _test_generation(model, model.batch_size, input_length=model.max_length * 2)
+
+
+@is_inferentia_test
+@requires_neuronx
+def test_decoder_generation_custom_stopping_criteria(model_and_tokenizer):
+    model = model_and_tokenizer[0]
+
+    class CustomStoppingCriteria(StoppingCriteria):
+        def __init__(self):
+            self.called = False
+
+        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+            self.called = True
+            return True
+
+    criteria = CustomStoppingCriteria()
+    model.generate(input_ids=torch.ones([1, 10], dtype=torch.int64), stopping_criteria=[criteria])
+    assert criteria.called, "Custom StoppingCriteria should have been called"
+
+
 @is_inferentia_test
 @requires_neuronx
-def test_generation_llama_padded_inputs(neuron_model_config):
-    model, tokenizer = neuron_model_config
+def test_decoder_generation_padded_inputs(model_and_tokenizer):
+    model, tokenizer = model_and_tokenizer
     prompt = "One of my fondest memory is of my grandmother making homemade bread"
     first_input = tokenizer(prompt)
     first_ids = first_input["input_ids"]
@@ -56,8 +110,8 @@
 
 @is_inferentia_test
 @requires_neuronx
-def test_decoder_generation_multiple_eos_token_ids(neuron_model_config):
-    model, tokenizer = neuron_model_config
+def test_decoder_generation_multiple_eos_token_ids(model_and_tokenizer):
+    model, tokenizer = model_and_tokenizer
     prompt = "Name three fruits:"
     tokens = tokenizer(prompt, return_tensors="pt")
     generation_config = copy.deepcopy(model.generation_config)
@@ -75,8 +129,8 @@ def test_decoder_generation_multiple_eos_token_ids(neuron_model_config):
 
 @is_inferentia_test
 @requires_neuronx
-def test_decoder_generation_stop_strings(neuron_model_config):
-    model, tokenizer = neuron_model_config
+def test_decoder_generation_stop_strings(model_and_tokenizer):
+    model, tokenizer = model_and_tokenizer
     prompt = "Name three fruits:"
     tokens = tokenizer(prompt, return_tensors="pt")
     generation_config = copy.deepcopy(model.generation_config)
diff --git a/tests/decoder/test_decoder_hub.py b/tests/decoder/test_decoder_hub.py
new file mode 100644
index 000000000..566d9659a
--- /dev/null
+++ b/tests/decoder/test_decoder_hub.py
@@ -0,0 +1,65 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import re
+import socket
+from tempfile import TemporaryDirectory
+
+import pytest
+from huggingface_hub import HfApi, get_token
+from transformers import AutoModelForCausalLM
+
+from optimum.neuron import NeuronModelForCausalLM
+from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx
+
+
+@is_inferentia_test
+@requires_neuronx
+@pytest.mark.parametrize("from_local", [False, True], ids=["from_hub", "from_local"])
+def test_decoder_push_to_hub(from_local):
+    model_id = "hf-internal-testing/tiny-random-gpt2"
+    with TemporaryDirectory() as model_path:
+        if from_local:
+            hub_model = AutoModelForCausalLM.from_pretrained(model_id)
+            with TemporaryDirectory() as tmpdir:
+                hub_model.save_pretrained(tmpdir)
+                model = NeuronModelForCausalLM.from_pretrained(tmpdir, export=True)
+                # Save must happen within the context of the tmpdir or checkpoint dir is lost
+                model.save_pretrained(model_path)
+        else:
+            model = NeuronModelForCausalLM.from_pretrained(model_id, export=True)
+            model.save_pretrained(model_path)
+        # The hub model contains the checkpoint only when the model is exported from a local path
+        ignore_patterns = [] if from_local else [model.CHECKPOINT_DIR + "/*"]
+        hostname = socket.gethostname()
+        model_name = f"neuron-testing-{hostname}-decoder-push"
+        model_name += "-from-local" if from_local else "-from-hub"
+        repo_id = f"optimum-internal-testing/{model_name}"
+        model.push_to_hub(model_path, repo_id, use_auth_token=get_token())
+        api = HfApi()
+        try:
+            hub_files_path = api.list_repo_files(repo_id)
+            for path, _, files in os.walk(model_path):
+                for name in files:
+                    local_file_path = os.path.join(path, name)
+                    hub_file_path = os.path.relpath(local_file_path, model_path)
+                    excluded = False
+                    for pattern in ignore_patterns:
+                        if re.compile(pattern).match(hub_file_path) is not None:
+                            excluded = True
+                            break
+                    assert excluded or hub_file_path in hub_files_path
+        finally:
+            api.delete_repo(repo_id)
diff --git a/tests/generation/test_fused_logits_warper.py b/tests/decoder/test_fused_logits_warper.py
similarity index 100%
rename from tests/generation/test_fused_logits_warper.py
rename to tests/decoder/test_fused_logits_warper.py
diff --git a/tests/generation/conftest.py b/tests/generation/conftest.py
index 845df8adf..0d19e865d 100644
--- a/tests/generation/conftest.py
+++ b/tests/generation/conftest.py
@@ -15,22 +15,13 @@
 from tempfile import TemporaryDirectory
 
 import pytest
-from transformers import AutoTokenizer, T5ForConditionalGeneration
+from transformers import T5ForConditionalGeneration
 
-from optimum.neuron import NeuronModelForCausalLM, NeuronModelForSeq2SeqLM
+from optimum.neuron import NeuronModelForSeq2SeqLM
 from optimum.neuron.utils.testing_utils import requires_neuronx
 from optimum.utils.testing_utils import USER
 
 
-DECODER_MODEL_ARCHITECTURES = ["bloom", "gpt2", "llama", "mistral", "mixtral", "opt"]
-DECODER_MODEL_NAMES = {
-    "bloom": "hf-internal-testing/tiny-random-BloomForCausalLM",
-    "gpt2": "hf-internal-testing/tiny-random-gpt2",
-    "llama": "dacorvo/tiny-random-llama",
-    "mistral": "dacorvo/tiny-random-MistralForCausalLM",
-    "mixtral": "dacorvo/Mixtral-tiny",
-    "opt": "hf-internal-testing/tiny-random-OPTForCausalLM",
-}
 TRN_DECODER_MODEL_ARCHITECTURES = ["bloom", "llama", "opt"]
 TRN_DECODER_MODEL_NAMES = {
     "bloom": "bigscience/bloom-560m",
@@ -45,13 +36,6 @@
 }
 
 
-@pytest.fixture(
-    scope="session", params=[DECODER_MODEL_NAMES[model_arch] for model_arch in DECODER_MODEL_ARCHITECTURES]
-)
-def export_decoder_id(request):
-    return request.param
-
-
 @pytest.fixture(
     scope="module", params=[TRN_DECODER_MODEL_NAMES[model_arch] for model_arch in TRN_DECODER_MODEL_ARCHITECTURES]
 )
@@ -69,23 +53,6 @@ def export_seq2seq_model_class(request):
     return request.param
 
 
-@pytest.fixture(scope="session")
-@requires_neuronx
-def neuron_decoder_path(export_decoder_id):
-    model = NeuronModelForCausalLM.from_pretrained(export_decoder_id, export=True, batch_size=2, num_cores=2)
-    model_dir = TemporaryDirectory()
-    model_path = model_dir.name
-    model.save_pretrained(model_path)
-    del model
-    tokenizer = AutoTokenizer.from_pretrained(export_decoder_id)
-    tokenizer.save_pretrained(model_path)
-    del tokenizer
-    # Yield instead of returning to keep a reference to the temporary directory.
-    # It will go out of scope and be released only once all tests needing the fixture
-    # have been completed.
-    yield model_path
-
-
 @pytest.fixture(scope="module")
 @requires_neuronx
 def neuron_seq2seq_beam_path(export_seq2seq_id):
diff --git a/tests/generation/test_export.py b/tests/generation/test_export.py
index 676a5e7ee..7737274ef 100644
--- a/tests/generation/test_export.py
+++ b/tests/generation/test_export.py
@@ -13,51 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from tempfile import TemporaryDirectory
 
 import pytest
-from generation_utils import check_neuron_model
-from transformers import AutoModelForCausalLM
 
-from optimum.neuron import NeuronModelForCausalLM, NeuronModelForSeq2SeqLM
+from optimum.neuron import NeuronModelForSeq2SeqLM
 from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx
 
 
-@pytest.mark.parametrize(
-    "batch_size, sequence_length, num_cores, auto_cast_type",
-    [
-        [1, 100, 2, "fp32"],
-        [1, 100, 2, "fp16"],
-        [2, 100, 2, "fp16"],
-    ],
-)
-@is_inferentia_test
-@requires_neuronx
-@pytest.mark.parametrize("local", [True, False], ids=["local", "from_hub"])
-def test_decoder_export(local, export_decoder_id, batch_size, sequence_length, num_cores, auto_cast_type):
-    export_kwargs = {
-        "batch_size": batch_size,
-        "sequence_length": sequence_length,
-        "num_cores": num_cores,
-        "auto_cast_type": auto_cast_type,
-    }
-    if local:
-        with TemporaryDirectory() as model_path:
-            model = AutoModelForCausalLM.from_pretrained(export_decoder_id)
-            model.save_pretrained(model_path)
-            model = NeuronModelForCausalLM.from_pretrained(model_path, export=True, **export_kwargs)
-    else:
-        model = NeuronModelForCausalLM.from_pretrained(export_decoder_id, export=True, **export_kwargs)
-    check_neuron_model(model, batch_size, sequence_length, num_cores, auto_cast_type)
-
-
-@is_inferentia_test
-@requires_neuronx
-def test_model_from_path(neuron_decoder_path):
-    model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path)
-    check_neuron_model(model)
-
-
 @pytest.mark.parametrize(
     "batch_size, sequence_length, num_beams",
     [
diff --git a/tests/generation/test_hub.py b/tests/generation/test_hub.py
index fd0f127ba..7e372ad9a 100644
--- a/tests/generation/test_hub.py
+++ b/tests/generation/test_hub.py
@@ -14,33 +14,15 @@
 # limitations under the License.
 import os
 import re
-from tempfile import TemporaryDirectory
 
-import pytest
-from generation_utils import check_neuron_model
 from huggingface_hub import HfApi
 from transformers.testing_utils import ENDPOINT_STAGING
 
-from optimum.neuron import NeuronModelForCausalLM, NeuronModelForSeq2SeqLM
+from optimum.neuron import NeuronModelForSeq2SeqLM
 from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx
 from optimum.utils.testing_utils import TOKEN, USER
 
 
-@is_inferentia_test
-@requires_neuronx
-@pytest.mark.parametrize(
-    "model_id, revision",
-    [
-        ["dacorvo/tiny-random-gpt2-neuronx", "1b3456cf877cc42c053ee8464f1067021eccde4b"],
-        ["dacorvo/tiny-random-gpt2-neuronx-no-checkpoint", "78eb2313ab7e149bbc22ff32257db93ba09e3033"],
-    ],
-    ids=["checkpoint", "no-checkpoint"],
-)
-def test_decoder_model_from_hub(model_id, revision):
-    model = NeuronModelForCausalLM.from_pretrained(model_id, revision=revision)
-    check_neuron_model(model, batch_size=16, sequence_length=512, num_cores=2, auto_cast_type="fp32")
-
-
 def _test_push_to_hub(model, model_path, repo_id, ignore_patterns=[]):
     model.push_to_hub(model_path, repo_id, use_auth_token=TOKEN, endpoint=ENDPOINT_STAGING)
     api = HfApi(endpoint=ENDPOINT_STAGING, token=TOKEN)
@@ -66,18 +48,6 @@ def neuron_push_model_id(model_id):
     return repo_id
 
 
-@is_inferentia_test
-@requires_neuronx
-def test_push_decoder_to_hub():
-    model_id = "hf-internal-testing/tiny-random-gpt2"
-    model = NeuronModelForCausalLM.from_pretrained(model_id, export=True)
-    with TemporaryDirectory() as tmpdir:
-        model.save_pretrained(tmpdir)
-        ignore_patterns = [model.CHECKPOINT_DIR + "/*"]
-        neuron_push_decoder_id = neuron_push_model_id(model_id)
-        _test_push_to_hub(model, tmpdir, neuron_push_decoder_id, ignore_patterns)
-
-
 @is_inferentia_test
 @requires_neuronx
 def test_seq2seq_model_from_hub():
diff --git a/tests/generation/test_tnx_generate.py b/tests/generation/test_tnx_generate.py
deleted file mode 100644
index 94b0f06c8..000000000
--- a/tests/generation/test_tnx_generate.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# coding=utf-8
-# Copyright 2023 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-import torch
-from transformers import AutoTokenizer
-from transformers.generation import StoppingCriteria
-
-from optimum.neuron import NeuronModelForCausalLM
-from optimum.neuron.utils.testing_utils import is_inferentia_test, requires_neuronx
-
-
-def _test_generation(model, batch_size, input_length, **gen_kwargs):
-    input_ids = torch.ones((batch_size, input_length), dtype=torch.int64)
-    with torch.inference_mode():
-        sample_output = model.generate(input_ids, **gen_kwargs)
-        assert sample_output.shape[0] == batch_size
-
-
-@pytest.mark.parametrize(
-    "gen_kwargs",
-    [
-        {"do_sample": True},
-        {"do_sample": True, "temperature": 0.7},
-        {"do_sample": False},
-        {"do_sample": False, "repetition_penalty": 1.2},
-    ],
-    ids=["sample", "sample-with-temp", "greedy", "greedy_no-repeat"],
-)
-@is_inferentia_test
-@requires_neuronx
-def test_decoder_generation(neuron_decoder_path, gen_kwargs):
-    model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path)
-    _test_generation(model, model.batch_size, 10, **gen_kwargs)
-
-
-@is_inferentia_test
-@requires_neuronx
-def test_model_generation_input_dimensions(neuron_decoder_path):
-    model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path)
-    AutoTokenizer.from_pretrained(neuron_decoder_path)
-    # Using valid input dimensions
-    _test_generation(model, model.batch_size, model.max_length // 2)
-    # Using an incompatible batch_size
-    with pytest.raises(ValueError, match="The specified batch_size"):
-        _test_generation(model, model.batch_size + 1, model.max_length)
-    # Using an incompatible input length
-    with pytest.raises(ValueError, match="The input sequence length"):
-        _test_generation(model, model.batch_size, input_length=model.max_length * 2)
-
-
-@is_inferentia_test
-@requires_neuronx
-def test_decoder_generation_custom_stopping_criteria(neuron_decoder_path):
-    model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path)
-
-    class CustomStoppingCriteria(StoppingCriteria):
-        def __init__(self):
-            self.called = False
-
-        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
-            self.called = True
-            return True
-
-    criteria = CustomStoppingCriteria()
-    model.generate(input_ids=torch.ones([1, 10], dtype=torch.int64), stopping_criteria=[criteria])
-    assert criteria.called, "Custom StoppingCriteria should have been called"
-
-
-@is_inferentia_test
-@requires_neuronx
-def test_decoder_generation_padded_inputs(neuron_decoder_path):
-    model = NeuronModelForCausalLM.from_pretrained(neuron_decoder_path)
-    assert model.batch_size >= 2
-    tokenizer = AutoTokenizer.from_pretrained(neuron_decoder_path)
-    prompt = (
-        "It was a bright cold day in April, and the clocks were striking thirteen."
- " Winston Smith, his chin nuzzled into his breast in an effort to escape the" - " vile wind, slipped quickly through the glass doors of Victory Mansions," - ) - first_input = tokenizer(prompt) - first_ids = first_input["input_ids"] - first_mask = first_input["attention_mask"] - max_padding = 12 - input_len = len(first_ids) - for i in range(max_padding): - second_ids = [tokenizer.eos_token_id] * i + first_ids[: input_len - i] - second_mask = [0] * i + [1] * (input_len - i) - input_ids = torch.tensor([first_ids, second_ids], dtype=torch.int64) - attention_mask = torch.tensor([first_mask, second_mask], dtype=torch.int64) - outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, do_sample=False) - # Verify we did not generate any unknown token - assert torch.all(outputs[:, -1] != 0) diff --git a/text-generation-inference/Dockerfile b/text-generation-inference/Dockerfile index 0846ce1a4..2941fdd6e 100644 --- a/text-generation-inference/Dockerfile +++ b/text-generation-inference/Dockerfile @@ -20,6 +20,8 @@ COPY --from=tgi /tgi/proto proto COPY --from=tgi /tgi/benchmark benchmark COPY --from=tgi /tgi/router router COPY --from=tgi /tgi/launcher launcher +# Remove the next line when bumping rust version +RUN cargo update ravif --precise 0.11.6 RUN cargo chef prepare --recipe-path recipe.json FROM chef AS builder