Fix MPMD detected error during training with TP (#648)

huggingface · Jul 5, 2024 · 281bad8
1 parent 20a49e9
commit 281bad8
Show file tree

Hide file tree

Showing 4 changed files with 9 additions and 8 deletions.
diff --git a/.github/workflows/test_trainium_common.yml b/.github/workflows/test_trainium_common.yml
@@ -52,7 +52,7 @@ jobs:
       - name: Run tests on Neuron cores
         run: |
           source aws_neuron_venv_pytorch/bin/activate
-          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV="false" pytest -m "is_trainium_test" $TESTS_TO_IGNORE_FLAGS tests --durations=0
+          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV="false" pytest -m "is_trainium_test" $TESTS_TO_IGNORE_FLAGS tests --durations=0 -v
       - name: Collect staging tests on Neuron Cores
         run: |
           source aws_neuron_venv_pytorch/bin/activate

diff --git a/optimum/neuron/accelerate/accelerator.py b/optimum/neuron/accelerate/accelerator.py
@@ -521,8 +521,13 @@ def autocast(self, cache_enabled: bool = False, autocast_handler: Optional[Autoc
             #   - `self.state.mixed_precision == "bf16"`
             #   - `self.state.autocast_backend is AutocastBackend.AMP`
             autocast_handler = self.autocast_handler
-        autocast_kwargs = autocast_handler.to_kwargs()
-        autocast_context = torch.autocast(dtype=torch.bfloat16, device_type="cuda", **autocast_kwargs)
+
+        if autocast_handler.enabled:
+            autocast_kwargs = autocast_handler.to_kwargs()
+            autocast_context = torch.autocast(dtype=torch.bfloat16, device_type="cuda", **autocast_kwargs)
+        else:
+            autocast_context = contextlib.nullcontext()
+
         autocast_context.__enter__()
         yield
         autocast_context.__exit__(*sys.exc_info())

diff --git a/optimum/neuron/trainers.py b/optimum/neuron/trainers.py
@@ -353,7 +353,6 @@ def _prepare_inputs(self, inputs: Dict[str, Union[torch.Tensor, Any]]) -> Dict[s
         return inputs
 
     def compute_loss(self, model, inputs, return_outputs: bool = False):
-        self.state.last_inputs = inputs
         from neuronx_distributed.pipeline import NxDPPModel
 
         if isinstance(model, NxDPPModel):
@@ -368,7 +367,6 @@ def autocast_smart_context_manager(self, cache_enabled: Optional[bool] = True):
         A helper wrapper that creates an appropriate context manager for `autocast` while feeding it the desired
         arguments, depending on the situation.
         """
-
         autocast_handler = AutocastKwargs(
             enabled=self.accelerator.autocast_handler.enabled,
             cache_enabled=cache_enabled,
@@ -408,8 +406,6 @@ def prediction_step(
     ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
         from neuronx_distributed.pipeline import NxDPPModel
 
-        self.state.last_inputs = inputs
-
         if isinstance(model, NxDPPModel):
             if not prediction_loss_only:
                 raise ValueError("Only the prediction loss can be returned when doing pipeline parallelism.")

diff --git a/optimum/neuron/utils/runner.py b/optimum/neuron/utils/runner.py
@@ -125,7 +125,7 @@ class ExampleRunner:
             "task_name": "sst2",
         },
         "token-classification": {
-            "dataset_name": "conll2003",
+            "dataset_name": "bnsapa/cybersecurity-ner",
             "set_max_length": True,
             "extra_command_line_arguments": [
                 "--pad_to_max_length",