diff --git a/config_files/training/config_lorem_ipsum.yaml b/config_files/training/config_lorem_ipsum.yaml
index fc1d8926..e4251a2f 100644
--- a/config_files/training/config_lorem_ipsum.yaml
+++ b/config_files/training/config_lorem_ipsum.yaml
@@ -42,7 +42,7 @@ train_dataloader:
     num_workers: 2
     pin_memory: true
     shuffle: false
-    dataloader_tag: "train"
+    dataloader_tag: train
     dataset:
       instance_key: train_dataset
       pass_type: BY_REFERENCE
@@ -264,7 +264,7 @@ optimizer:
     betas: [0.9, 0.95]
     eps: 1e-8
     weight_decay: 1e-1
-    weight_decay_groups_excluded: ["embedding", "layernorm"]
+    weight_decay_groups_excluded: [embedding, layernorm]
     wrapped_model:
       instance_key: wrapped_model
       pass_type: BY_REFERENCE
diff --git a/src/modalities/batch.py b/src/modalities/batch.py
index 5a47511c..633dc269 100644
--- a/src/modalities/batch.py
+++ b/src/modalities/batch.py
@@ -105,11 +105,9 @@ class EvaluationResultBatch(Batch):
     throughput_metrics: Dict[str, torch.Tensor] = field(default_factory=lambda: dict())
 
     def __str__(self) -> str:
-        eval_str = f"Evaluation result on dataset tag {self.dataloader_tag} after {self.num_train_steps_done} steps:"
-        eval_str += "\n\nlosses: " + "\n\t".join([f"{k}: {v.mean().item()}" for k, v in self.losses.items()])
-        eval_str += "\n\nmetrics: " + "\n\t".join([f"{k}: {v.mean().item()}" for k, v in self.metrics.items()])
-        eval_str += "\n\nthroughput metrics: " + "\n\t".join(
-            [f"{k}: {v.mean().item()}" for k, v in self.throughput_metrics.items()]
-        )
-        eval_str += "\n==============================================="
+        eval_str = f"Dataloader: {self.dataloader_tag} | "
+        eval_str += f"step: {self.num_train_steps_done} | "
+        eval_str += " | ".join([f"{k}: {v.mean().item()}" for k, v in self.throughput_metrics.items()]) + " | "
+        eval_str += " | ".join([f"{k}: {v.mean().item()}" for k, v in self.losses.items()]) + " | "
+        eval_str += " | ".join([f"{k}: {v.mean().item()}" for k, v in self.metrics.items()]) + " | "
         return eval_str
diff --git a/src/modalities/trainer.py b/src/modalities/trainer.py
index 51aaa273..2f8a72ad 100644
--- a/src/modalities/trainer.py
+++ b/src/modalities/trainer.py
@@ -141,10 +141,6 @@ def train(
                     num_train_steps_done=num_train_steps_done,
                     dataloader_tag=train_loader.dataloader_tag,
                 )
-                print(
-                    f"num_train_steps_done: {num_train_steps_done}, micro_batch_id: {micro_batch_id}",
-                    f" (micro_batch_id +1) % GAS: {(micro_batch_id +1) % self.gradient_acc_steps}",
-                )
                 # Check if model performance should be logged
                 if num_train_steps_done % training_log_interval_in_steps == 0 and step_performed:
                     forward_backward_time = torch.tensor(forward_backward_time_recorder.delta_t).to(device)
@@ -175,15 +171,15 @@
                         reduced_losses[1],
                     )
                     losses = {
-                        f"{loss_fun.tag} average": train_loss_avg,
-                        f"{loss_fun.tag} last step": train_loss_last_batch,
+                        "train loss avg": train_loss_avg,
+                        "train loss last": train_loss_last_batch,
                     }
 
                     consumed_tokens = torch.Tensor([num_train_steps_done * self.global_num_tokens_per_train_step])
                     metrics = {
-                        "consumed_tokens": consumed_tokens,
-                        "grad_norm_avg": torch.mean(torch.Tensor(gradient_norm_scores)),
-                        "grad_norm_last_batch": gradient_norm_scores[-1],
+                        "consumed tokens": consumed_tokens,
+                        "grad norm avg": torch.mean(torch.Tensor(gradient_norm_scores)),
+                        "grad norm last": torch.tensor(gradient_norm_scores[-1]),
                     }
 
                     gradient_norm_scores = []
@@ -192,13 +188,14 @@
                         metrics=metrics,
                         # TODO: hardcoded metric key
                         throughput_metrics={
-                            "training_synced_num_samples_per_second": synced_num_samples_per_second,
-                            "lr_mean": torch.tensor(scheduler.get_last_lr()).mean(),
-                            "lr_first": torch.tensor(scheduler.get_last_lr())[0],
+                            "train samples/s": synced_num_samples_per_second,
+                            "lr mean": torch.tensor(scheduler.get_last_lr()).mean(),
                         },
                         dataloader_tag=train_loader.dataloader_tag,
                         num_train_steps_done=num_train_steps_done,
                     )
+                    if self.local_rank == 0:
+                        print(training_metrics)
                     self._publish_evaluation_result(
                         evaluation_result_publisher=self.evaluation_result_publisher,
                         evaluation_result=training_metrics,
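
For reference, a minimal standalone sketch of the single-line log format the reworked __str__ is intended to produce. It uses plain torch and hypothetical example values rather than the EvaluationResultBatch class itself:

    import torch

    # Hypothetical example values; in the trainer these come from the training loop.
    dataloader_tag = "train"
    num_train_steps_done = 128
    throughput_metrics = {"train samples/s": torch.tensor(2048.0), "lr mean": torch.tensor(3.0e-4)}
    losses = {"train loss avg": torch.tensor(2.41), "train loss last": torch.tensor(2.38)}
    metrics = {"consumed tokens": torch.tensor(524288.0), "grad norm avg": torch.tensor(0.97)}

    # Same formatting scheme as the new __str__: everything on one pipe-separated line.
    eval_str = f"Dataloader: {dataloader_tag} | "
    eval_str += f"step: {num_train_steps_done} | "
    eval_str += " | ".join(f"{k}: {v.mean().item()}" for k, v in throughput_metrics.items()) + " | "
    eval_str += " | ".join(f"{k}: {v.mean().item()}" for k, v in losses.items()) + " | "
    eval_str += " | ".join(f"{k}: {v.mean().item()}" for k, v in metrics.items()) + " | "
    print(eval_str)
    # e.g. Dataloader: train | step: 128 | train samples/s: 2048.0 | lr mean: 0.0003 | train loss avg: 2.41 | ...
    # (float32 rounding may add trailing digits)

With the rank-0 guard in trainer.py, this line is printed once per logging interval instead of the previous multi-line block per rank.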