refactor: improved less verbose logging
le1nux committed Jul 12, 2024
1 parent 55a896f commit b380a66
Showing 3 changed files with 16 additions and 21 deletions.
4 changes: 2 additions & 2 deletions config_files/training/config_lorem_ipsum.yaml
@@ -42,7 +42,7 @@ train_dataloader:
     num_workers: 2
     pin_memory: true
     shuffle: false
-    dataloader_tag: "train"
+    dataloader_tag: train
     dataset:
       instance_key: train_dataset
       pass_type: BY_REFERENCE
@@ -264,7 +264,7 @@ optimizer:
     betas: [0.9, 0.95]
     eps: 1e-8
     weight_decay: 1e-1
-    weight_decay_groups_excluded: ["embedding", "layernorm"]
+    weight_decay_groups_excluded: [embedding, layernorm]
     wrapped_model:
       instance_key: wrapped_model
       pass_type: BY_REFERENCE
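Both YAML edits above only drop redundant quoting: a plain scalar such as train loads to the same Python string as "train", and the bracketed list entries behave the same way. A quick PyYAML check (illustrative, not part of the commit):

# Illustrative check that the quoted and unquoted spellings load to identical values.
# Assumes PyYAML is available; this snippet is not repository code.
import yaml

quoted = yaml.safe_load('dataloader_tag: "train"\nweight_decay_groups_excluded: ["embedding", "layernorm"]')
plain = yaml.safe_load("dataloader_tag: train\nweight_decay_groups_excluded: [embedding, layernorm]")

# Both parse to {'dataloader_tag': 'train', 'weight_decay_groups_excluded': ['embedding', 'layernorm']}
assert quoted == plain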
12 changes: 5 additions & 7 deletions src/modalities/batch.py
@@ -105,11 +105,9 @@ class EvaluationResultBatch(Batch):
     throughput_metrics: Dict[str, torch.Tensor] = field(default_factory=lambda: dict())
 
     def __str__(self) -> str:
-        eval_str = f"Evaluation result on dataset tag {self.dataloader_tag} after {self.num_train_steps_done} steps:"
-        eval_str += "\n\nlosses: " + "\n\t".join([f"{k}: {v.mean().item()}" for k, v in self.losses.items()])
-        eval_str += "\n\nmetrics: " + "\n\t".join([f"{k}: {v.mean().item()}" for k, v in self.metrics.items()])
-        eval_str += "\n\nthroughput metrics: " + "\n\t".join(
-            [f"{k}: {v.mean().item()}" for k, v in self.throughput_metrics.items()]
-        )
-        eval_str += "\n==============================================="
+        eval_str = f"Dataloader: {self.dataloader_tag} | "
+        eval_str = f"step: {self.num_train_steps_done} | "
+        eval_str += " | ".join([f"{k}: {v.mean().item()}" for k, v in self.throughput_metrics.items()]) + " | "
+        eval_str += " | ".join([f"{k}: {v.mean().item()}" for k, v in self.losses.items()]) + " | "
+        eval_str += " | ".join([f"{k}: {v.mean().item()}" for k, v in self.metrics.items()]) + " | "
         return eval_str
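For illustration only (not part of the commit), a minimal standalone sketch of the pipe-separated single-line format the rewritten __str__ moves to; the dataloader tag, step count, and values below are invented:

# Sketch: build a one-line "key: value | key: value" evaluation summary.
# All names and numbers are made up for illustration.
import torch

def format_eval_line(dataloader_tag, num_train_steps_done, losses, metrics, throughput_metrics):
    line = f"Dataloader: {dataloader_tag} | "
    line += f"step: {num_train_steps_done} | "
    line += " | ".join(f"{k}: {v.mean().item()}" for k, v in throughput_metrics.items()) + " | "
    line += " | ".join(f"{k}: {v.mean().item()}" for k, v in losses.items()) + " | "
    line += " | ".join(f"{k}: {v.mean().item()}" for k, v in metrics.items()) + " | "
    return line

print(
    format_eval_line(
        dataloader_tag="train",
        num_train_steps_done=8,
        losses={"train loss avg": torch.tensor(2.25)},
        metrics={"consumed tokens": torch.tensor([16384.0])},
        throughput_metrics={"train samples/s": torch.tensor(512.0)},
    )
)
# Dataloader: train | step: 8 | train samples/s: 512.0 | train loss avg: 2.25 | consumed tokens: 16384.0 |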
21 changes: 9 additions & 12 deletions src/modalities/trainer.py
@@ -141,10 +141,6 @@ def train(
                 num_train_steps_done=num_train_steps_done,
                 dataloader_tag=train_loader.dataloader_tag,
             )
-            print(
-                f"num_train_steps_done: {num_train_steps_done}, micro_batch_id: {micro_batch_id}",
-                f" (micro_batch_id +1) % GAS: {(micro_batch_id +1) % self.gradient_acc_steps}",
-            )
             # Check if model performance should be logged
             if num_train_steps_done % training_log_interval_in_steps == 0 and step_performed:
                 forward_backward_time = torch.tensor(forward_backward_time_recorder.delta_t).to(device)
@@ -175,15 +171,15 @@ def train(
                     reduced_losses[1],
                 )
                 losses = {
-                    f"{loss_fun.tag} average": train_loss_avg,
-                    f"{loss_fun.tag} last step": train_loss_last_batch,
+                    "train loss avg": train_loss_avg,
+                    "train loss last": train_loss_last_batch,
                 }
 
                 consumed_tokens = torch.Tensor([num_train_steps_done * self.global_num_tokens_per_train_step])
                 metrics = {
-                    "consumed_tokens": consumed_tokens,
-                    "grad_norm_avg": torch.mean(torch.Tensor(gradient_norm_scores)),
-                    "grad_norm_last_batch": gradient_norm_scores[-1],
+                    "consumed tokens": consumed_tokens,
+                    "grad norm avg": torch.mean(torch.Tensor(gradient_norm_scores)),
+                    "grad norm last": torch.tensor(gradient_norm_scores[-1]),
                 }
                 gradient_norm_scores = []

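The renamed gradient-norm metrics also change how the last value is stored: gradient_norm_scores is a plain Python list of floats, and its last entry is now wrapped in torch.tensor(...), presumably so every metric value is a tensor that supports the v.mean().item() call in EvaluationResultBatch.__str__. A small sketch of the conversion with invented numbers:

# Sketch: per-step gradient norms (floats) converted to the tensors used as metric values.
# The numbers are invented; only the conversion pattern mirrors the diff above.
import torch

gradient_norm_scores = [2.0, 1.5, 1.0, 0.5]  # hypothetical per-step values

grad_norm_avg = torch.mean(torch.Tensor(gradient_norm_scores))  # tensor(1.2500)
grad_norm_last = torch.tensor(gradient_norm_scores[-1])         # tensor(0.5000)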
@@ -192,13 +188,14 @@
                     metrics=metrics,
                     # TODO: hardcoded metric key
                     throughput_metrics={
-                        "training_synced_num_samples_per_second": synced_num_samples_per_second,
-                        "lr_mean": torch.tensor(scheduler.get_last_lr()).mean(),
-                        "lr_first": torch.tensor(scheduler.get_last_lr())[0],
+                        "train samples/s": synced_num_samples_per_second,
+                        "lr mean": torch.tensor(scheduler.get_last_lr()).mean(),
                     },
                     dataloader_tag=train_loader.dataloader_tag,
                     num_train_steps_done=num_train_steps_done,
                 )
+                if self.local_rank == 0:
+                    print(training_metrics)
                 self._publish_evaluation_result(
                     evaluation_result_publisher=self.evaluation_result_publisher,
                     evaluation_result=training_metrics,
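The new print is guarded by self.local_rank == 0, the usual way to avoid emitting one duplicated log line per process in a distributed run. A generic sketch of that idiom (standalone, with invented names; the Trainer itself relies on its self.local_rank attribute rather than this helper):

# Sketch: only the process with local rank 0 writes to the console, so a job
# with N ranks does not print N copies of every training-metrics line.
import os

def log_on_rank_zero(message: str) -> None:
    # torchrun exports LOCAL_RANK for each worker process; defaulting to "0"
    # keeps this helper usable in a single-process run.
    if int(os.environ.get("LOCAL_RANK", "0")) == 0:
        print(message)

log_on_rank_zero("Dataloader: train | step: 8 | ...")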
