From 281bad805cce36df8b1e0600b5568228c0130f7d Mon Sep 17 00:00:00 2001
From: Michael Benayoun
Date: Fri, 5 Jul 2024 10:24:49 +0200
Subject: [PATCH] Fix MPMD detected error during training with TP (#648)

---
 .github/workflows/test_trainium_common.yml | 2 +-
 optimum/neuron/accelerate/accelerator.py   | 9 +++++++--
 optimum/neuron/trainers.py                 | 4 ----
 optimum/neuron/utils/runner.py             | 2 +-
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/test_trainium_common.yml b/.github/workflows/test_trainium_common.yml
index c2bbe2419..738d4d482 100644
--- a/.github/workflows/test_trainium_common.yml
+++ b/.github/workflows/test_trainium_common.yml
@@ -52,7 +52,7 @@ jobs:
       - name: Run tests on Neuron cores
         run: |
           source aws_neuron_venv_pytorch/bin/activate
-          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV="false" pytest -m "is_trainium_test" $TESTS_TO_IGNORE_FLAGS tests --durations=0
+          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV="false" pytest -m "is_trainium_test" $TESTS_TO_IGNORE_FLAGS tests --durations=0 -v
       - name: Collect staging tests on Neuron Cores
         run: |
           source aws_neuron_venv_pytorch/bin/activate
diff --git a/optimum/neuron/accelerate/accelerator.py b/optimum/neuron/accelerate/accelerator.py
index ee08423cf..c3ef669f3 100644
--- a/optimum/neuron/accelerate/accelerator.py
+++ b/optimum/neuron/accelerate/accelerator.py
@@ -521,8 +521,13 @@ def autocast(self, cache_enabled: bool = False, autocast_handler: Optional[Autoc
             # - `self.state.mixed_precision == "bf16"`
             # - `self.state.autocast_backend is AutocastBackend.AMP`
             autocast_handler = self.autocast_handler
-        autocast_kwargs = autocast_handler.to_kwargs()
-        autocast_context = torch.autocast(dtype=torch.bfloat16, device_type="cuda", **autocast_kwargs)
+
+        if autocast_handler.enabled:
+            autocast_kwargs = autocast_handler.to_kwargs()
+            autocast_context = torch.autocast(dtype=torch.bfloat16, device_type="cuda", **autocast_kwargs)
+        else:
+            autocast_context = contextlib.nullcontext()
+
         autocast_context.__enter__()
         yield
         autocast_context.__exit__(*sys.exc_info())
diff --git a/optimum/neuron/trainers.py b/optimum/neuron/trainers.py
index 873838ec3..237018c9f 100755
--- a/optimum/neuron/trainers.py
+++ b/optimum/neuron/trainers.py
@@ -353,7 +353,6 @@ def _prepare_inputs(self, inputs: Dict[str, Union[torch.Tensor, Any]]) -> Dict[s
         return inputs
 
     def compute_loss(self, model, inputs, return_outputs: bool = False):
-        self.state.last_inputs = inputs
         from neuronx_distributed.pipeline import NxDPPModel
 
         if isinstance(model, NxDPPModel):
@@ -368,7 +367,6 @@ def autocast_smart_context_manager(self, cache_enabled: Optional[bool] = True):
         A helper wrapper that creates an appropriate context manager for `autocast` while feeding it the desired
         arguments, depending on the situation.
         """
-
         autocast_handler = AutocastKwargs(
             enabled=self.accelerator.autocast_handler.enabled,
             cache_enabled=cache_enabled,
@@ -408,8 +406,6 @@ def prediction_step(
     ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
         from neuronx_distributed.pipeline import NxDPPModel
 
-        self.state.last_inputs = inputs
-
         if isinstance(model, NxDPPModel):
             if not prediction_loss_only:
                 raise ValueError("Only the prediction loss can be returned when doing pipeline parallelism.")
diff --git a/optimum/neuron/utils/runner.py b/optimum/neuron/utils/runner.py
index aa4f5a634..42e599d3a 100644
--- a/optimum/neuron/utils/runner.py
+++ b/optimum/neuron/utils/runner.py
@@ -125,7 +125,7 @@ class ExampleRunner:
             "task_name": "sst2",
         },
        "token-classification": {
-            "dataset_name": "conll2003",
+            "dataset_name": "bnsapa/cybersecurity-ner",
             "set_max_length": True,
             "extra_command_line_arguments": [
                 "--pad_to_max_length",