From 281bad805cce36df8b1e0600b5568228c0130f7d Mon Sep 17 00:00:00 2001
From: Michael Benayoun
Date: Fri, 5 Jul 2024 10:24:49 +0200
Subject: [PATCH] Fix MPMD detected error during training with TP (#648)

---
 .github/workflows/test_trainium_common.yml | 2 +-
 optimum/neuron/accelerate/accelerator.py   | 9 +++++++--
 optimum/neuron/trainers.py                 | 4 ----
 optimum/neuron/utils/runner.py             | 2 +-
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/test_trainium_common.yml b/.github/workflows/test_trainium_common.yml
index c2bbe2419..738d4d482 100644
--- a/.github/workflows/test_trainium_common.yml
+++ b/.github/workflows/test_trainium_common.yml
@@ -52,7 +52,7 @@ jobs:
       - name: Run tests on Neuron cores
         run: |
           source aws_neuron_venv_pytorch/bin/activate
-          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV="false" pytest -m "is_trainium_test" $TESTS_TO_IGNORE_FLAGS tests --durations=0
+          HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV="false" pytest -m "is_trainium_test" $TESTS_TO_IGNORE_FLAGS tests --durations=0 -v
       - name: Collect staging tests on Neuron Cores
         run: |
           source aws_neuron_venv_pytorch/bin/activate
diff --git a/optimum/neuron/accelerate/accelerator.py b/optimum/neuron/accelerate/accelerator.py
index ee08423cf..c3ef669f3 100644
--- a/optimum/neuron/accelerate/accelerator.py
+++ b/optimum/neuron/accelerate/accelerator.py
@@ -521,8 +521,13 @@ def autocast(self, cache_enabled: bool = False, autocast_handler: Optional[Autoc
             # - `self.state.mixed_precision == "bf16"`
             # - `self.state.autocast_backend is AutocastBackend.AMP`
             autocast_handler = self.autocast_handler
-        autocast_kwargs = autocast_handler.to_kwargs()
-        autocast_context = torch.autocast(dtype=torch.bfloat16, device_type="cuda", **autocast_kwargs)
+
+        if autocast_handler.enabled:
+            autocast_kwargs = autocast_handler.to_kwargs()
+            autocast_context = torch.autocast(dtype=torch.bfloat16, device_type="cuda", **autocast_kwargs)
+        else:
+            autocast_context = contextlib.nullcontext()
+
         autocast_context.__enter__()
         yield
         autocast_context.__exit__(*sys.exc_info())
diff --git a/optimum/neuron/trainers.py b/optimum/neuron/trainers.py
index 873838ec3..237018c9f 100755
--- a/optimum/neuron/trainers.py
+++ b/optimum/neuron/trainers.py
@@ -353,7 +353,6 @@ def _prepare_inputs(self, inputs: Dict[str, Union[torch.Tensor, Any]]) -> Dict[s
         return inputs
 
     def compute_loss(self, model, inputs, return_outputs: bool = False):
-        self.state.last_inputs = inputs
         from neuronx_distributed.pipeline import NxDPPModel
 
         if isinstance(model, NxDPPModel):
@@ -368,7 +367,6 @@ def autocast_smart_context_manager(self, cache_enabled: Optional[bool] = True):
         A helper wrapper that creates an appropriate context manager for `autocast` while feeding it the desired
         arguments, depending on the situation.
         """
-
         autocast_handler = AutocastKwargs(
             enabled=self.accelerator.autocast_handler.enabled,
             cache_enabled=cache_enabled,
@@ -408,8 +406,6 @@ def prediction_step(
     ) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
         from neuronx_distributed.pipeline import NxDPPModel
 
-        self.state.last_inputs = inputs
-
         if isinstance(model, NxDPPModel):
             if not prediction_loss_only:
                 raise ValueError("Only the prediction loss can be returned when doing pipeline parallelism.")
diff --git a/optimum/neuron/utils/runner.py b/optimum/neuron/utils/runner.py
index aa4f5a634..42e599d3a 100644
--- a/optimum/neuron/utils/runner.py
+++ b/optimum/neuron/utils/runner.py
@@ -125,7 +125,7 @@ class ExampleRunner:
             "task_name": "sst2",
         },
        "token-classification": {
-            "dataset_name": "conll2003",
+            "dataset_name": "bnsapa/cybersecurity-ner",
             "set_max_length": True,
             "extra_command_line_arguments": [
                 "--pad_to_max_length",