Skip to content

Commit

Permalink
Fix MPMD detected error during training with TP (#648)
Browse files (browse the repository at this point in the history)
  • Loading branch information
michaelbenayoun authored Jul 5, 2024
1 parent 20a49e9 commit 281bad8
Show file tree
Hide file tree
Showing 4 changed files with 9 additions and 8 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test_trainium_common.yml
Original file line number | Diff line number | Diff line change
Expand Up @@ -52,7 +52,7 @@ jobs:
- name: Run tests on Neuron cores
run: |
source aws_neuron_venv_pytorch/bin/activate
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV="false" pytest -m "is_trainium_test" $TESTS_TO_IGNORE_FLAGS tests --durations=0
HF_TOKEN=${{ secrets.HF_TOKEN_OPTIMUM_NEURON_CI }} USE_VENV="false" pytest -m "is_trainium_test" $TESTS_TO_IGNORE_FLAGS tests --durations=0 -v
- name: Collect staging tests on Neuron Cores
run: |
source aws_neuron_venv_pytorch/bin/activate
Expand Down
9 changes: 7 additions & 2 deletions optimum/neuron/accelerate/accelerator.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -521,8 +521,13 @@ def autocast(self, cache_enabled: bool = False, autocast_handler: Optional[Autoc
# - `self.state.mixed_precision == "bf16"`
# - `self.state.autocast_backend is AutocastBackend.AMP`
autocast_handler = self.autocast_handler
autocast_kwargs = autocast_handler.to_kwargs()
autocast_context = torch.autocast(dtype=torch.bfloat16, device_type="cuda", **autocast_kwargs)

if autocast_handler.enabled:
autocast_kwargs = autocast_handler.to_kwargs()
autocast_context = torch.autocast(dtype=torch.bfloat16, device_type="cuda", **autocast_kwargs)
else:
autocast_context = contextlib.nullcontext()

autocast_context.__enter__()
yield
autocast_context.__exit__(*sys.exc_info())
Expand Down
4 changes: 0 additions & 4 deletions optimum/neuron/trainers.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -353,7 +353,6 @@ def _prepare_inputs(self, inputs: Dict[str, Union[torch.Tensor, Any]]) -> Dict[s
return inputs

def compute_loss(self, model, inputs, return_outputs: bool = False):
self.state.last_inputs = inputs
from neuronx_distributed.pipeline import NxDPPModel

if isinstance(model, NxDPPModel):
Expand All @@ -368,7 +367,6 @@ def autocast_smart_context_manager(self, cache_enabled: Optional[bool] = True):
A helper wrapper that creates an appropriate context manager for `autocast` while feeding it the desired
arguments, depending on the situation.
"""

autocast_handler = AutocastKwargs(
enabled=self.accelerator.autocast_handler.enabled,
cache_enabled=cache_enabled,
Expand Down Expand Up @@ -408,8 +406,6 @@ def prediction_step(
) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
from neuronx_distributed.pipeline import NxDPPModel

self.state.last_inputs = inputs

if isinstance(model, NxDPPModel):
if not prediction_loss_only:
raise ValueError("Only the prediction loss can be returned when doing pipeline parallelism.")
Expand Down
2 changes: 1 addition & 1 deletion optimum/neuron/utils/runner.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -125,7 +125,7 @@ class ExampleRunner:
"task_name": "sst2",
},
"token-classification": {
"dataset_name": "conll2003",
"dataset_name": "bnsapa/cybersecurity-ner",
"set_max_length": True,
"extra_command_line_arguments": [
"--pad_to_max_length",
Expand Down

0 comments on commit 281bad8

Please sign in to comment.