
Commit

Merge branch 'main' into huggingface_models_support
le1nux committed Feb 9, 2024
2 parents f435fc8 + da65493 commit e93e767
Showing 2 changed files with 9 additions and 6 deletions.
src/modalities/activation_checkpointing.py (4 additions, 4 deletions)
@@ -11,14 +11,14 @@
 from modalities.models.gpt2.gpt2_model import GPT2Block


-def is_module_to_apply_activation_checkpointing(submodule: torch.nn.Module):
+def is_module_to_apply_activation_checkpointing(submodule: torch.nn.Module) -> bool:
     return isinstance(submodule, GPT2Block)


-def apply_activation_checkpointing_inplace(model: torch.nn.Module) -> None:
+def apply_activation_checkpointing_inplace(model: torch.nn.Module):
     assert isinstance(model, FSDP), "activation checkpointing can only be applied to FSDP wrapped models!"
-    non_reentrant_wrapper = partial(checkpoint_wrapper, checkpoint_impl=CheckpointImpl.NO_REENTRANT, debug=True)
+    non_reentrant_wrapper = partial(checkpoint_wrapper, checkpoint_impl=CheckpointImpl.NO_REENTRANT, debug=False)

-    return apply_activation_checkpointing(
+    apply_activation_checkpointing(
         model, checkpoint_wrapper_fn=non_reentrant_wrapper, check_fn=is_module_to_apply_activation_checkpointing
     )
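For context, the pattern being tweaked above can be sketched in isolation. The snippet below is a minimal illustration, not the repository's code: it uses an invented ToyBlock in place of GPT2Block and skips the FSDP wrapping that apply_activation_checkpointing_inplace asserts on, but the check_fn / checkpoint_wrapper_fn mechanics are the same PyTorch API the diff touches.

from functools import partial

import torch
import torch.nn as nn
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
    CheckpointImpl,
    apply_activation_checkpointing,
    checkpoint_wrapper,
)


class ToyBlock(nn.Module):
    # Stand-in for GPT2Block: the submodule type we want checkpointed.
    def __init__(self, dim: int):
        super().__init__()
        self.ff = nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + self.ff(x)


def check_fn(submodule: nn.Module) -> bool:
    # Mirrors is_module_to_apply_activation_checkpointing: select only the block type.
    return isinstance(submodule, ToyBlock)


model = nn.Sequential(*[ToyBlock(16) for _ in range(3)])

# Non-reentrant checkpointing, wrapping in place only the blocks selected by check_fn.
non_reentrant_wrapper = partial(checkpoint_wrapper, checkpoint_impl=CheckpointImpl.NO_REENTRANT)
apply_activation_checkpointing(model, checkpoint_wrapper_fn=non_reentrant_wrapper, check_fn=check_fn)

x = torch.randn(2, 16, requires_grad=True)
model(x).sum().backward()  # block activations are recomputed during this backward pass

In the repository the same call is made on an FSDP-wrapped model, which is why the assert in apply_activation_checkpointing_inplace rejects anything else.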
src/modalities/trainer.py (5 additions, 2 deletions)
@@ -58,6 +58,7 @@ def train(
         optimizer,
         loss_fun: Loss,
         callback_interval_in_batches: int,
+        # TODO: remove
         epoch_done_callback: Callable[[int], None],
         local_sample_id_to_global_sample_id: Callable[[int], int],
     ):
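The two Callable parameters hint at how the trainer maps data-loader-local sample ids back to global ones after a resume. The repository's actual mapping lives outside this diff, so the helper below is only an assumption about its shape: a fixed offset equal to the number of samples consumed before the checkpoint, analogous to the fast_forward_batch_id offset applied to batch ids further down.

# Hypothetical sketch (not the repository's implementation) of a callable that could be
# passed as local_sample_id_to_global_sample_id when training resumes from a checkpoint.
def make_local_to_global_sample_id(num_samples_already_seen: int):
    def local_to_global_sample_id(local_sample_id: int) -> int:
        # Data-loader-local ids restart at 0 after a resume; shift them by the
        # number of samples already consumed before the checkpoint.
        return local_sample_id + num_samples_already_seen

    return local_to_global_sample_id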
@@ -67,12 +68,14 @@

         # batch loop
         batch: DatasetBatch
+        # TODO: why do we need a barrier here?
+        dist.barrier()
         forward_backward_time_recorder = TimeRecorder()
         forward_backward_time_recorder.start()
         for batch_id, batch in enumerate(train_loader):
             # Because we might resume training, we add the starting batch id of the data loader
             local_train_batch_id = batch_id + train_loader.fast_forward_batch_id
-            # train single batch
+            # Train single batch
             batch_loss = self._train_batch(
                 batch=batch,
                 model=model,
@@ -82,7 +85,7 @@
                 data_loader=train_loader,
             )
             forward_backward_time_recorder.stop()
-            # save the batch loss
+            # Save the batch loss
             cummulated_loss[0] += batch_loss.item()
             cummulated_loss[1] += len(batch)
             batch_length_tensor = torch.tensor(len(batch)).to(torch.device(self.local_rank))
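cummulated_loss and batch_length_tensor feed the metric aggregation that follows in the part of the file not shown in this diff. Since that code is not visible here, the snippet below is only a sketch of the usual pattern: each rank accumulates a (loss sum, sample count) pair locally and the pair is summed across ranks with an all-reduce before dividing.

import torch
import torch.distributed as dist


def reduce_mean_loss(cummulated_loss: torch.Tensor) -> float:
    # Assumed layout, matching the loop above: index 0 holds the summed batch losses,
    # index 1 the number of samples that produced them.
    if dist.is_available() and dist.is_initialized():
        dist.all_reduce(cummulated_loss, op=dist.ReduceOp.SUM)  # sum both entries across ranks
    return (cummulated_loss[0] / cummulated_loss[1]).item()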
